Assess the presence and abundance of Gardnerella spp. and their impacts on the vaginal microbiomes of pregnant women from three distinct cohorts.

Set up

Load Packages

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(kableExtra)
library(broom)
library(formattable)
library(reshape2)
library(vegan)
library(ggbeeswarm)
library(ggExtra)
 library(coin)
library(cowplot)
library(rstatix)
library(ggpubr)
`%!in%` <- negate(`%in%`)
set.seed(7334)

Define file paths

figureOut <- "../FIGURES/resubmission_manuscript_figures"

tmpFigureOut <- "../FIGURES/resubmission_manuscript_figures/tmp" # folder for storing temp files for figure formatting

suDF <- "../METADATA/sumgsDF.tsv" %>% # metadata for Stanford and UAB
  read_tsv %>%
  mutate(SampleID=as.character(SampleID))

hmp2DF <- "../METADATA/hmp2mgsDF.tsv" %>%
  read_tsv %>%
  mutate(SampleID=as.character(SampleID),
         Cohort="MOMS-PI")

# sample data
sgDF0 <- suDF %>%
  select("SampleID", "SubjectID", "Cohort", "sampleTrimester", "GDcoll", "GWdel", "Age", "Income", "Education", "deliveryMode", "Race", "Ethnicity", "TermPre", "TotalPairs", "Sickle_pairs", "bbmapFiltered_pairs") %>%
  mutate(Cohort=paste(Cohort, "Enriched")) %>%
  full_join(hmp2DF[,c("SampleID", "SubjectID", "Cohort", "sampleTrimester", "GDcoll", "GWdel", "Age", "Income", "Education", "deliveryMode", "Race", "Ethnicity", "TermPre", "TotalPairs", "Sickle_pairs", "bbmapFiltered_pairs")], by = c("SampleID", "SubjectID", "Cohort", "sampleTrimester", "GDcoll", "GWdel", "Age", "Income", "Education", "deliveryMode", "Race", "Ethnicity", "TermPre", "TotalPairs", "Sickle_pairs", "bbmapFiltered_pairs")) %>%
  mutate(Cohort=factor(Cohort, levels = c("Stanford Enriched", "UAB Enriched", "MOMS-PI")), # Stanford and UAB samples were selected for enrichment in Gardnerella, label them has Enriched.
         SampleID=as.character(SampleID), 
         SubjectID=as.character(SubjectID),
         pctHuman=((Sickle_pairs-bbmapFiltered_pairs)/Sickle_pairs)*100, # calculate percent human reads
         bacLoad=bbmapFiltered_pairs/(Sickle_pairs-bbmapFiltered_pairs)) # calculate microbial load

Taxa abundance tables Stanford and UAB cohorts:

suMetaphlanPath <- "./metaphlan4/stuab_mergedMetaphlanAbundance.tsv"
suGardCladePath <- "./gardCladeRelativeAbundance.tsv"
suGardGenomoPath <- "./gardGenomospeciesRelativeAbundance.tsv"

MOMS-PI (HMP2) cohort

hmp2MetaphlanPath <- "./metaphlan4/hmp2mgs_mergedMetaphlanAbundance.tsv"
hmp2GardCladePath <- "./hmp2gardCladeRelativeAbundance.tsv"
hmp2GardGenomoPath <- "./hmp2gardGenomospeciesRelativeAbundance.tsv"

ggplot settings

Themes:

theme_set(theme_bw()+
          theme(panel.grid = element_blank(),
                axis.text = element_text(size=10),
                axis.title = element_text(size=15),
                legend.text = element_text(size=12),
                legend.title = element_text(size=14)))

Clade colors:

cladeColors <-  list(scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73"), labels=c("C1", "C2", "C3", "C4", "C5", "C6")),
                     scale_fill_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73"), labels=c("C1", "C2", "C3", "C4", "C5", "C6")))

cladeColorsUnch <-  list(scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "#A9A9A9"), labels=c("C1", "C2", "C3", "C4", "C5", "C6", bquote('Uncharacterized'~italic(Gardnerella)))),
                     scale_fill_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "#A9A9A9"), labels=c("C1", "C2", "C3", "C4", "C5", "C6", bquote('Uncharacterized'~italic(Gardnerella)))))

cladeColorsTotal <- list(scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "#000000"), labels=c("C1", "C2", "C3", "C4", "C5", "C6", bquote('Total'~italic(Gardnerella)))),
                         scale_fill_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "#000000"), labels=c("C1", "C2", "C3", "C4", "C5", "C6", bquote('Total'~italic(Gardnerella)))))

cladeColorsTotalUnch <- list(scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "#A9A9A9", "#000000"), labels=c("C1", "C2", "C3", "C4", "C5", "C6", bquote('Uncharacterized'~italic(Gardnerella)),  bquote('Total'~italic(Gardnerella)))),
                         scale_fill_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "#A9A9A9", "#000000"), labels=c("C1", "C2", "C3", "C4", "C5", "C6", bquote('Uncharacterized'~italic(Gardnerella)),  bquote('Total'~italic(Gardnerella)))))

genomoColorsTotal <- list(scale_color_manual(values=c("salmon1", "salmon3", "indianred1", "indianred2", "indianred3", "lightblue1", "lightblue2", "lightblue3", "lightblue4", "cadetblue1", "blue4", "blue", "#E69F00", "#009E73", "#000000"),
                     labels=c(bquote(italic("G. vaginalis")),
                              "GS2",
                              "GS3",
                              bquote(italic("G. piotti")),
                              "GS11",
                              "GS7",
                              "GS8",
                              "GS9",
                              "GS10",
                              "GS14",
                              bquote(italic("G. leopoldii")),
                              bquote(italic("G. swidsinskii")),
                              "GS12",
                              "GS13",
                              bquote('Total'~italic(Gardnerella)))),
                     scale_fill_manual(values=c("salmon1", "salmon3", "indianred1", "indianred2", "indianred3", "lightblue1", "lightblue2", "lightblue3", "lightblue4", "cadetblue1", "blue4", "blue", "#E69F00", "#009E73", "#000000"),
                     labels=c(bquote(italic("G. vaginalis")),
                              "GS2",
                              "GS3",
                              bquote(italic("G. piotti")),
                              "GS11",
                              "GS7",
                              "GS8",
                              "GS9",
                              "GS10",
                              "GS14",
                              bquote(italic("G. leopoldii")),
                              bquote(italic("G. swidsinskii")),
                              "GS12",
                              "GS13",
                              bquote('Total'~italic(Gardnerella)))))


genomoColorsTotalUnch <- list(scale_color_manual(values=c("salmon1", "salmon3", "indianred1", "indianred2", "indianred3", "lightblue1", "lightblue2", "lightblue3", "lightblue4", "cadetblue1", "blue4", "blue", "#E69F00", "#009E73", "#A9A9A9", "#000000"),
                     labels=c(bquote(italic("G. vaginalis")),
                              "GS2",
                              "GS3",
                              bquote(italic("G. piotti")),
                              "GS11",
                              "GS7",
                              "GS8",
                              "GS9",
                              "GS10",
                              "GS14",
                              bquote(italic("G. leopoldii")),
                              bquote(italic("G. swidsinskii")),
                              "GS12",
                              "GS13",
                              bquote('Uncharacterized'~italic(Gardnerella)),
                              bquote('Total'~italic(Gardnerella)))),
                     scale_fill_manual(values=c("salmon1", "salmon3", "indianred1", "indianred2", "indianred3", "lightblue1", "lightblue2", "lightblue3", "lightblue4", "cadetblue1", "blue4", "blue", "#E69F00", "#009E73", "#A9A9A9", "#000000"),
                     labels=c(bquote(italic("G. vaginalis")),
                              "GS2",
                              "GS3",
                              bquote(italic("G. piotti")),
                              "GS11",
                              "GS7",
                              "GS8",
                              "GS9",
                              "GS10",
                              "GS14",
                              bquote(italic("G. leopoldii")),
                              bquote(italic("G. swidsinskii")),
                              "GS12",
                              "GS13",
                              bquote('Uncharacterized'~italic(Gardnerella)),
                              bquote('Total'~italic(Gardnerella)))))

Gardnerella and Lactobacillus Colors:

gLColors <-   list(scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73","darkgreen", "khaki1", "darkblue", "darkred"), labels=c("G. vaginalis C1", "G. vaginalis C2", "G. vaginalis C3", "G. vaginalis C4", "G. vaginalis C5", "G. vaginalis C6", "L. crispatus", "L. gasseri", "L. iners", "L. jensenii")),
  scale_fill_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73","darkgreen", "khaki1", "darkblue", "darkred", "darkseagreen4"), labels=c("G. vaginalis C1", "G. vaginalis C2", "G. vaginalis C3", "G. vaginalis C4", "G. vaginalis C5", "G. vaginalis C6", "L. crispatus", "L. gasseri", "L. iners", "L. jensenii")))

All Taxa Colors:

taxColors <-   list(scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73",
                                                 "darkgreen", "khaki1", "darkred", "darkblue",
                                                 "mediumpurple4", "#C42104",  "darkseagreen4", "#2004C2"),
                                        labels=c("C1", "C2", "C3", "C4", "C5", "C6",
                                                 bquote(italic("L. crispatus")), bquote(italic("L. gasseri")), bquote(italic("L. jensenii")), bquote(italic("L.iners")),
                                                 bquote(italic("F. vaginae")), bquote(italic("M. lornae")), bquote(italic("P. amnii")), bquote(italic("S. vaginalis"))
                                        )),
                     scale_fill_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73",
                                                 "darkgreen", "khaki1", "darkred", "darkblue",
                                                 "mediumpurple4", "#C42104",  "darkseagreen4", "#2004C2"),
                                        labels=c("C1", "C2", "C3", "C4", "C5", "C6",
                                                 bquote(italic("L. crispatus")), bquote(italic("L. gasseri")), bquote(italic("L. jensenii")), bquote(italic("L.iners")),
                                                 bquote(italic("F. vaginae")), bquote(italic("M. lornae")), bquote(italic("P. amnii")), bquote(italic("S. vaginalis"))
                                        )))

taxColorsOther <-   list(scale_color_manual(values=c("#808080", "#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73",
                                                 "darkgreen", "khaki1", "darkred", "darkblue",
                                                 "mediumpurple4", "#C42104",  "darkseagreen4", "#2004C2"),
                                        labels=c("Other", "C1", "C2", "C3", "C4", "C5", "C6",
                                                 bquote(italic("L. crispatus")), bquote(italic("L. gasseri")), bquote(italic("L. jensenii")), bquote(italic("L.iners")),
                                                 bquote(italic("F. vaginae")), bquote(italic("M. lornae")), bquote(italic("P. amnii")), bquote(italic("S. vaginalis"))
                                        )),
                     scale_fill_manual(values=c("#808080", "#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73",
                                                 "darkgreen", "khaki1", "darkred", "darkblue",
                                                 "mediumpurple4", "#C42104",  "darkseagreen4", "#2004C2"),
                                        labels=c("Other", "C1", "C2", "C3", "C4", "C5", "C6",
                                                 bquote(italic("L. crispatus")), bquote(italic("L. gasseri")), bquote(italic("L. jensenii")), bquote(italic("L.iners")),
                                                 bquote(italic("F. vaginae")), bquote(italic("M. lornae")), bquote(italic("P. amnii")), bquote(italic("S. vaginalis"))
                                        )))

Bi/Tri/Quad Colors:

binaryColors <- list(scale_color_manual(values = c("#D55E00", "#56B4E9")),
                     scale_fill_manual(values=c("#D55E00", "#56B4E9")))

trinaryColors <- list(scale_color_manual(values = c("#F77754", "#018790", "#0A516D")),
                      scale_fill_manual(values = c(c("#F77754", "#018790", "#0A516D"))))

quadColors <- list(scale_color_manual(values = c("#F77754", "#018790", "#0A516D", "#2B2726")),
                      scale_fill_manual(values = c(c("#F77754", "#018790", "#0A516D", "#2B2726")))) 

quadColors2 <- list(scale_color_manual(values = c("#D55E00", "#009E73", "#CC79A7", "#56B4E9")),
                     scale_fill_manual(values=c("#D55E00","#009E73", "#CC79A7", "#56B4E9")))

Other shortcuts

Create some shortcut lists for convenience.

## Other shortcuts
clades <- c("C1", "C2", "C3", "C4", "C5", "C6")
genomos <- c("GS1", "GS2", "GS3", "GS4", "GS5", "GS6", "GS7", "GS8", "GS9", "GS10", "GS11", "GS12", "GS13", "GS14")
genomoNames <- c("G. vag.", "GS2", "GS3", "G. pio.", "G. leo.", "G. swi.", "GS7", "GS8", "GS9", "GS10", "GS11", "GS12", "GS13", "GS14")
genomosCladeOrder <- c("GS1", "GS2", "GS3", "GS4", "GS11", "GS7", "GS8", "GS9", "GS10", "GS14", "GS5", "GS6", "GS12", "GS13")
genomoNamesCladeOrder <- c("G. vag.", "GS2", "GS3", "G. pio.", "GS11", "GS7", "GS8", "GS9", "GS10", "GS14", "G. leo.", "G. swi.", "GS12", "GS13")

lactos <- c("Lactobacillus_crispatus", "Lactobacillus_gasseri", "Lactobacillus_jensenii", "Lactobacillus_iners")
anaerobes <- c("Fannyhessea_vaginae", "Finegoldia_magna",  "Mycoplasma_hominis", "Megasphaera_lornae", "Prevotella_amnii", "Prevotella_bivia", "Prevotella_timonensis", "Sneathia_vaginalis",  "Ureaplasma_parvum")
anaerobesSL <- c("Fannyhessea_vaginae", "Megasphaera_lornae", "Prevotella_amnii", "Sneathia_vaginalis") # short list of anaerobes

abbrevs <- c("TG", clades, "Lc", "Lg", "Lj", "Li", "Fv", "Fm", "Mh", "Ml", "Pa", "Pb", "Pt", "Sv", "Up")
abbrevsSL <- c("TG", clades, "Lc", "Lg", "Lj", "Li",  "Fv", "Ml", "Pa", "Sv")  # short list of abbreviations

cohorts <- c("Stanford Enriched", "UAB Enriched", "MOMS-PI Enriched", "MOMS-PI")
cohortAbbrevs <- c("Stanford Enr.", "UAB Enr.", "MOMS-PI Enr.", "MOMS-PI")

Remove low-read samples

Starting number of samples and subjects:

sgDF0 %>%
  group_by(Cohort) %>%
  summarize(`N Subjects`=n_distinct(SubjectID), `N Samples`=n_distinct(SampleID)) %>%
  formattable()
Cohort N Subjects N Samples
Stanford Enriched 20 62
UAB Enriched 15 45
MOMS-PI 234 930

Library sizes

Paired reads in total library size, after quality filtering by Sickle, and after removing human reads.

sgDF0 %>%
  gather("Step", "paired_reads", c(TotalPairs, Sickle_pairs, bbmapFiltered_pairs)) %>%
  mutate(Step=factor(Step, levels=c("TotalPairs", "Sickle_pairs", "bbmapFiltered_pairs"), labels=c("Library Size", "Sickle Filtered", "Microbial Reads"))) %>%
  group_by(Step, Cohort) %>%
  summarise(min=min(paired_reads), mean=mean(paired_reads), max=max(paired_reads)) %>%
  mutate_if(is.numeric, ~prettyNum(.x, big.mark = ",")) %>%
  formattable()
Step Cohort min mean max
Library Size Stanford Enriched 14,655,421 19,681,739 29,263,108
Library Size UAB Enriched 13,081,280 19,239,013 35,557,864
Library Size MOMS-PI 12,210 10,162,162 89,647,588
Sickle Filtered Stanford Enriched 12,757,737 17,122,069 24,968,647
Sickle Filtered UAB Enriched 11,695,954 16,964,926 30,690,425
Sickle Filtered MOMS-PI 0 6,877,062 78,549,522
Microbial Reads Stanford Enriched 45,287 2,108,541 17,576,674
Microbial Reads UAB Enriched 81,037 3,811,026 15,652,482
Microbial Reads MOMS-PI 0 669,755.7 30,053,898

Percent filtered at each step

sgDF0 %>%
  mutate(`Sickle % Total`=Sickle_pairs/TotalPairs,
         `BBmap % Sickle`=bbmapFiltered_pairs/Sickle_pairs,
         `BBmap % Total`=bbmapFiltered_pairs/TotalPairs) %>%
  group_by(Cohort) %>%
  summarise_at(c("Sickle % Total",  "BBmap % Sickle", "BBmap % Total"), ~median(.x, na.rm=TRUE)) %>%
  formattable()
Cohort Sickle % Total BBmap % Sickle BBmap % Total
Stanford Enriched 0.8685448 0.04800637 0.04134656
UAB Enriched 0.8793049 0.20023969 0.17588516
MOMS-PI 0.7215741 0.04755454 0.02472847

sickle_pct_total = percent of paired reads left after sickle filtering bbmap_pct_sicke = percent of sickle paired reads left after removing human reads bbmap_pct_total = percent of all paired reads left after sickle filtering and removing human reads

View number of paired reads after sickle and human filtering per sample and determine filtering threshold.

p <- sgDF0 %>%
  ggplot(aes(x=Sickle_pairs, y=bbmapFiltered_pairs, fill=Cohort, color=Cohort)) +
  geom_point(alpha=0.25) +
  trinaryColors +
  geom_vline(xintercept = 1e6, color="grey", linetype="dashed") + # 1 million paired reads after sickle
  scale_x_log10(n.breaks=10) +
  scale_y_log10(n.breaks=10) +
  theme_bw() +
  theme(legend.position = "bottom",
        axis.text = element_text(size=10),
        axis.title = element_text(size=15),
        legend.text = element_text(size=12),
        legend.title = element_text(size=14)) +
  labs(x="QC Filtered Paired Reads", y="Microbial Paired Reads")
ggMarginal(p, groupFill = TRUE, type="density")

Remove the samples with fewer than 1 million paired reads after Sickle filtering and trimming.

QCfilterSamples <- sgDF0 %>%
  filter(Sickle_pairs>1e6) %>% # list of samples that pass QC filtering criteria and will be used in analyses
  .$SampleID
length(QCfilterSamples) # number of samples that will be used in analyses
## [1] 888
sgDF1 <- sgDF0 %>%
  filter(SampleID %in% QCfilterSamples)

Number of samples remaining in each cohort:

sgDF1 %>%
  group_by(Cohort) %>%
  summarize(`N Subjects`=n_distinct(SubjectID), `N Samples`=n_distinct(SampleID)) %>%
  formattable()
Cohort N Subjects N Samples
Stanford Enriched 20 62
UAB Enriched 15 45
MOMS-PI 231 781

Will proceed with a total of 888 samples, 62 from Stanford, 45 from UAB, and 781 from MOMS-PI

Library sizes and percent filtered at each step post QC sample removal:

readCountsPlot <- sgDF1 %>%
  gather("Step", "paired_reads", c(TotalPairs, Sickle_pairs, bbmapFiltered_pairs)) %>%
  mutate(Step=factor(Step, levels=c("TotalPairs", "Sickle_pairs", "bbmapFiltered_pairs"), labels=c("Library Size", "Sickle Filtered", "Microbial Reads")),
         Cohort=factor(Cohort, levels = c("Stanford Enriched", "UAB Enriched", "MOMS-PI"), labels=c("Sanford\nEnriched", "UAB\nEnriched", "MOMS-PI"))) %>%
  ggplot(aes(x=Cohort, y=paired_reads, fill=Cohort, color=Cohort)) +
  geom_quasirandom(dodge.width = 0.9, alpha=0.5, size=1, show.legend = FALSE) +
  geom_boxplot(alpha=0, position=position_dodge(width=0.9), color="black", show.legend = FALSE) +
  scale_y_continuous(n.breaks = 10, minor_breaks = waiver()) +
  facet_wrap(~Step) +
  trinaryColors +
  theme_bw() +
  theme(axis.text.x = element_text(size=10),
        axis.text.y=element_text(size=13),
        strip.text = element_text(size=13),
        axis.title = element_text(size=16), 
        aspect.ratio = 1.5) +
  labs(x=NULL, y="Paired Reads")

Percent Human Reads

percentHumanHists <- sgDF1 %>%
  group_by(Cohort) %>%
  ggplot(aes(pctHuman)) +
  geom_histogram(aes(y=stat(width*density))) +
  scale_y_continuous(limits = c(0,0.4), n.breaks=9) +
  facet_wrap(~Cohort) +
  theme(axis.text = element_text(size=13),
        axis.title = element_text(size=16),
        strip.text = element_text(size=13),
        aspect.ratio = 1.5) +
  labs(x="Percent Human Reads", y="Proportion of Samples")

MOMS-PI samples had greatest percent of human reads overall. They were not selected for enrichment in Gardnerella. Samples in UAB cohort had the smallest percent of human reads.

Summary table of library sizes and percent human reads of QC filtered samples

readsTable <- sgDF1 %>%
  group_by(Cohort) %>%
  summarise_at(vars(TotalPairs, Sickle_pairs, bbmapFiltered_pairs, pctHuman), list(mean=mean, sd=sd)) %>%
  mutate_at(vars(pctHuman_mean, pctHuman_sd), ~round(.x, 2)) %>%
  mutate_if(is.numeric, ~prettyNum(.x, big.mark = ",")) %>%
  transmute(Cohort=Cohort,
            `Library Size`=paste0(TotalPairs_mean, " (", TotalPairs_sd, ")"),
            `Sickle Filtered`=paste0(Sickle_pairs_mean, " (", Sickle_pairs_sd, ")"),
            `Microbial Reads`=paste0(bbmapFiltered_pairs_mean, " (", bbmapFiltered_pairs_sd, ")"),
            `Percent Human Reads`=paste0(pctHuman_mean, " (", pctHuman_sd, ")")) %>%
  kbl() %>%
  kable_classic(full_width=TRUE, html_font = "Arial")

#save table to tmp file for adding to rest of read count figure

readsTablePngPath <- file.path(tmpFigureOut, paste(Sys.Date(), "readsTableComponent.png", sep="_"))

readsTable %>%
  save_kable(file = readsTablePngPath, zoom=4)

Figure S4: Read counts and percent human reads

#fig.height=2.5, fig.width=7
readsAB <- plot_grid(readCountsPlot, percentHumanHists, nrow = 1, align = "hv", axis="tb", labels = c("A", "B"), label_size = 15)

printReadsTable <- ggdraw() + draw_image(readsTablePngPath)

plot_grid(readsAB, printReadsTable, nrow=2, rel_heights = c(1, 0.25), align = "v", axis = "l", labels = c("", "C"), label_size = 15)

#ggsave(filename = file.path(figureOut, paste(Sys.Date(), "FigureS4_ReadCounts.png", sep="_")))

Metadata

Description of cohorts and sample collection schemes

Figure S3: Gestational age at collection

#fig.height=3, fig.width=5
samplingSched <- sgDF1 %>%
  filter(!is.na(GDcoll)) %>%
  mutate(GWcoll=GDcoll/7,
         GWcoll=floor(GWcoll),
         Cohort=factor(Cohort, levels=c("Stanford Enriched", "UAB Enriched", "MOMS-PI"))) %>%
  split(.$Cohort) %>%
  map(., ~ggplot(.x) +
    geom_point(aes(x=ceiling(GWcoll), y=SubjectID), size=1, alpha=0.5) +
    geom_point(aes(x=(GWdel+0.5), y=SubjectID), shape=")") +
    facet_grid(.~Cohort) +
    theme(axis.text = element_text(size=13),
        axis.title = element_text(size=16),
        strip.text = element_text(size=13),
        aspect.ratio = 1.5) +
    labs(x=NULL, y=NULL))

stanfordSamplingSched <- samplingSched$`Stanford Enriched` +
  labs(y="Subject ID")

uabSamplingSched <-  samplingSched$`UAB Enriched` +
  labs(x="Gestation Weeks")

momspiSamplingSched <- samplingSched$`MOMS-PI` +
  theme(axis.text.y = element_text(size=3))

plot_grid(stanfordSamplingSched, uabSamplingSched, momspiSamplingSched, rel_widths = c(1, 1, 1), rel_heights = c(1, 1, 1), align = "hv", nrow = 1) 

#ggsave(file.path(figureOut, paste(Sys.Date(), "FigureS3_samplingSchedule.png", sep="_")))
sgDF1 %>%
  filter(!is.na(GDcoll)) %>%
  mutate(GWcoll=GDcoll/7,
         GWcoll=floor(GWcoll)) %>%
  group_by(Cohort) %>%
  summarize(across(GWcoll, list(median=median, `25th`=~quantile(.x, 0.25), `75th`=~quantile(.x, 0.75))))
## # A tibble: 3 × 4
##   Cohort            GWcoll_median GWcoll_25th GWcoll_75th
##   <fct>                     <dbl>       <dbl>       <dbl>
## 1 Stanford Enriched            22          16          31
## 2 UAB Enriched                 26          19          32
## 3 MOMS-PI                      29          20          36

Get MOMS-PI subjects with gestational age at delivery

hmp2SubjectsWDel <- sgDF1 %>%
  filter(Cohort=="MOMS-PI",
         !is.na(TermPre)) %>%
  .$SubjectID %>%
  unique

head(hmp2SubjectsWDel)
## [1] "2442683" "2442687" "2442892" "2442905" "2442618" "2443013"
length(hmp2SubjectsWDel)
## [1] 133

Table 1: Demographic and Clinical data

#GAAD 
gaadDF <- sgDF1 %>%
  select(SubjectID, Cohort, TermPre, GWdel) %>%
  unique() %>%
  add_count(Cohort, TermPre) %>%
  group_by(Cohort, TermPre, n) %>%
  summarize(mean=round(mean(GWdel, na.rm=TRUE), 1), 
            sd=round(sd(GWdel, na.rm = TRUE), 1)) %>%
  replace_na(list(mean=0, sd=0)) %>%
  mutate_at(c("mean", "sd"), as.character) %>%
  ungroup() %>%
  mutate(mean=case_when(mean=="0"~"-",
                       mean!="0"~mean),
         sd=case_when(sd=="0"~"-",
                     sd!="0"~sd),
         TermPre=case_when(TermPre=="T"~"Term",
                           TermPre=="P"~"Preterm",
                           is.na(TermPre)~"Unknown"),
          cat=paste0(Cohort, "\n", TermPre, " (n=", n, ")"),
         `Mean gestational age in weeks at delivery (±SD)`=paste0(mean, " (", sd, ")")) %>%
  select(cat, `Mean gestational age in weeks at delivery (±SD)`) %>%
  column_to_rownames(var="cat") %>%
  t %>%
  as.data.frame %>%
  rownames_to_column(var="x") %>%
  select("x", "Stanford Enriched\nTerm (n=11)", "Stanford Enriched\nPreterm (n=9)", "UAB Enriched\nTerm (n=5)", "UAB Enriched\nPreterm (n=10)", "MOMS-PI\nTerm (n=90)", "MOMS-PI\nPreterm (n=43)", "MOMS-PI\nUnknown (n=98)")

# Demographics
demDF <- sgDF1 %>%
  select(SubjectID, Cohort, TermPre, Age, Race, Ethnicity, Education, Income, deliveryMode) %>%
  unique() %>%
  mutate(Cohort=as.character(Cohort)) %>%
  add_count(Cohort, TermPre, name="term_count") %>%
  mutate(TermPre=case_when(TermPre=="T"~"Term",
                           TermPre=="P"~"Preterm",
                           is.na(TermPre)~"Unknown"),
         cat=paste0(Cohort, "\n", TermPre, " (n=", term_count, ")"),
         Income=case_when(Income %in% c("No income", "Under $10,000", "$10,000 - $30,000", "$30,000 - $50,000", "$50,000 - $60,000", "Under $15,000", "$15,000 - $19,999", "$20,000 - $39,999", "$40,000 - $59,999", "$60,000 - $79,999")~"Under $80,000", #fix income levels
                          Income %in% c("$80,000 - $100,000", "$150,000 - $200,000", "$250,000 or more", "$80,000 or more")~"$80,000 or more",
                          Income=="Unknown"~"Unknown")) %>% 
  select(-SubjectID, -TermPre, -Cohort, -term_count) %>%
  map2_df(., data.frame(matrix(rep(.$cat, 7), ncol=7)), ~count(data.frame(x=.x, y=.y), x, y), .id="z") %>%
  filter(z!="cat") %>%
  group_by(z,y) %>%
  mutate(n=paste0(n, " (", round(n/sum(n)*100, 0), "%)")) %>%
  ungroup %>%
  spread(y, n) %>%
  mutate_all(~replace_na(.x, "0 (0%)")) %>%
  mutate(x=paste(x,z, sep="_"), #add variable category for arranging purposes because "unknown" appears multiple times and therefore causes an error when making x a factor class.
         x=factor(x, c("Below 18_Age", "18 to 28_Age", "29 to 38_Age", "Above 38_Age", "Unknown_Age", "Asian_Race", "Black_Race", "White_Race", "Other_Race", "Hispanic_Ethnicity", "Non-Hispanic_Ethnicity", "Unknown_Ethnicity", "Less than high school_Education", "High school diploma or GED_Education", "Some college_Education", "Bachelor or undergraduate degree_Education", "Post-undergraduate degree_Education", "Unknown_Education", "Under $80,000_Income", "$80,000 or more_Income", "Unknown_Income", "Vaginal_deliveryMode", "Cesarean_deliveryMode", "Unknown_deliveryMode"))) %>%
  arrange(x) %>% # order rows
  mutate(x=str_extract(x, ".*(?=_)")) %>% # remove category label now that rows are arranged
  select(x, `Stanford Enriched\nTerm (n=11)`, `Stanford Enriched\nPreterm (n=9)`, `UAB Enriched\nTerm (n=5)`, `UAB Enriched\nPreterm (n=10)`, `MOMS-PI\nTerm (n=90)`, `MOMS-PI\nPreterm (n=43)`, `MOMS-PI\nUnknown (n=98)`) # order columns

demTable <- gaadDF %>%
  rbind(demDF) %>%
  dplyr::rename("  "=x) %>%
  dplyr::rename_at(2:8, ~str_extract(.x, "(?<=\\n).*")) %>%
  kbl(caption = "Table 1. Cohort Demographic Information") %>%
  kable_classic(full_width=TRUE, html_font = "Arial") %>%
  add_header_above(c(" ", "Stanford Enriched" = 2, "UAB Enriched" = 2, "MOMS-PI" = 3)) %>%
  pack_rows("Age", 2, 6) %>%
  pack_rows("Race", 7, 10) %>%
  pack_rows("Ethnicity", 11, 13) %>%
  pack_rows("Education", 14, 19) %>%
  pack_rows("Income", 20, 22) %>%
  pack_rows("Delivery Mode", 23, 25)
demTable
Table 1. Cohort Demographic Information
Stanford Enriched
UAB Enriched
MOMS-PI
Term (n=11) Preterm (n=9) Term (n=5) Preterm (n=10) Term (n=90) Preterm (n=43) Unknown (n=98)
Mean gestational age in weeks at delivery (±SD) 39.2 (1.7) 33.9 (2.3) 37.8 (1.3) 32.7 (3.4) 40 (0.7) 34.1 (3.4)
  • (-)
Age
Below 18 0 (0%) 0 (0%) 0 (0%) 0 (0%) 2 (2%) 1 (2%) 1 (1%)
18 to 28 2 (18%) 0 (0%) 4 (80%) 6 (60%) 57 (63%) 28 (65%) 54 (55%)
29 to 38 4 (36%) 6 (67%) 1 (20%) 3 (30%) 27 (30%) 12 (28%) 39 (40%)
Above 38 0 (0%) 0 (0%) 0 (0%) 0 (0%) 4 (4%) 2 (5%) 2 (2%)
Unknown 5 (45%) 3 (33%) 0 (0%) 1 (10%) 0 (0%) 0 (0%) 2 (2%)
Race
Asian 2 (18%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 2 (2%)
Black 0 (0%) 0 (0%) 3 (60%) 8 (80%) 70 (78%) 30 (70%) 56 (57%)
White 6 (55%) 4 (44%) 0 (0%) 1 (10%) 14 (16%) 6 (14%) 30 (31%)
Other 3 (27%) 5 (56%) 2 (40%) 1 (10%) 6 (7%) 7 (16%) 10 (10%)
Ethnicity
Hispanic 3 (27%) 6 (67%) 1 (20%) 0 (0%) 6 (7%) 4 (9%) 4 (4%)
Non-Hispanic 5 (45%) 2 (22%) 4 (80%) 9 (90%) 84 (93%) 39 (91%) 93 (95%)
Unknown 3 (27%) 1 (11%) 0 (0%) 1 (10%) 0 (0%) 0 (0%) 1 (1%)
Education
Less than high school 1 (9%) 3 (33%) 3 (60%) 4 (40%) 6 (7%) 5 (12%) 11 (11%)
High school diploma or GED 1 (9%) 3 (33%) 1 (20%) 3 (30%) 38 (42%) 20 (47%) 27 (28%)
Some college 2 (18%) 1 (11%) 1 (20%) 2 (20%) 23 (26%) 9 (21%) 20 (20%)
Bachelor or undergraduate degree 4 (36%) 1 (11%) 0 (0%) 0 (0%) 17 (19%) 6 (14%) 24 (24%)
Post-undergraduate degree 2 (18%) 1 (11%) 0 (0%) 0 (0%) 4 (4%) 3 (7%) 14 (14%)
Unknown 1 (9%) 0 (0%) 0 (0%) 1 (10%) 2 (2%) 0 (0%) 2 (2%)
Income
Under $80,000 6 (55%) 6 (67%) 5 (100%) 9 (90%) 81 (90%) 40 (93%) 78 (80%)
$80,000 or more 4 (36%) 1 (11%) 0 (0%) 0 (0%) 4 (4%) 1 (2%) 15 (15%)
Unknown 1 (9%) 2 (22%) 0 (0%) 1 (10%) 5 (6%) 2 (5%) 5 (5%)
Delivery Mode
Vaginal 5 (45%) 1 (11%) 5 (100%) 8 (80%) 71 (79%) 38 (88%) 76 (78%)
Cesarean 5 (45%) 8 (89%) 0 (0%) 2 (20%) 16 (18%) 3 (7%) 14 (14%)
Unknown 1 (9%) 0 (0%) 0 (0%) 0 (0%) 3 (3%) 2 (5%) 8 (8%)
#save_kable(demTable, file = file.path(figureOut, paste(Sys.Date(), "Table1_demTable.png", sep="_")), zoom=5)

Set up taxa data frames

Organize and set up tables from MetaPhlAn4 and Gardnerella mapping method. ## MetaPhlAn4 Species relative abundances for Stanford and UAB cohorts:

sumDF <- suMetaphlanPath %>%
  read_tsv(comment = "#") %>%
  filter(str_detect(clade_name, "s__"), !str_detect(clade_name, "t__")) %>%
  mutate(Species = str_extract(clade_name, "(?<=s__)\\w+$")) %>%
  select(Species, everything(), -clade_name) %>%
  dplyr::rename_at(2:ncol(.), ~str_extract(.x, "[0-9]{10}")) %>%
  column_to_rownames(var="Species") %>%
  t %>%
  as.data.frame() %>%
  mutate_all(funs(./100)) %>%
  rownames_to_column(var="SampleID")
sumDF[1:6,1:5]
##     SampleID Gardnerella_vaginalis Lactobacillus_gasseri Fannyhessea_vaginae
## 1 1000801248             0.5088316             0.4911684           0.0000000
## 2 1000801318             0.3342827             0.6657173           0.0000000
## 3 1000801368             0.4610783             0.5389217           0.0000000
## 4 1001301158             0.5297729             0.0000000           0.1014975
## 5 1001301248             0.5452431             0.0000000           0.0688638
## 6 1001301318             0.7189185             0.0000000           0.0113387
##   Megasphaera_lornae
## 1          0.0000000
## 2          0.0000000
## 3          0.0000000
## 4          0.0806591
## 5          0.1571441
## 6          0.0040567

Sanity check that relative abundances sum to 1

sumDF %>%
  column_to_rownames("SampleID") %>%
  rowSums %>%
  as.data.frame() %>%
  summary
##        .    
##  Min.   :1  
##  1st Qu.:1  
##  Median :1  
##  Mean   :1  
##  3rd Qu.:1  
##  Max.   :1
sumDF %>%
  column_to_rownames("SampleID") %>%
  rowSums %>%
  as.data.frame() %>%
  ggplot(aes(x=`.`)) +
  geom_histogram()

Species relative abundances for MOMS-PI cohort:

hmp2mDF <- hmp2MetaphlanPath %>%
 read_tsv(comment = "#") %>%
  filter(str_detect(clade_name, "s__"), !str_detect(clade_name, "t__")) %>%
  mutate(Species = str_extract(clade_name, "(?<=s__)\\w+$")) %>%
  select(Species, everything(), -clade_name) %>%
  as.data.frame() %>%
  dplyr::rename_at(2:ncol(.), ~str_extract(.x, "[0-9]{7}")) %>%
  column_to_rownames(var="Species") %>%
  t %>%
  as.data.frame() %>%
  mutate_all(funs(./100)) %>%
  rownames_to_column(var="SampleID") %>%
  filter(SampleID %in% QCfilterSamples) # keep only samples that pass quality filtering
hmp2mDF[1:6,1:5]
##   SampleID Gardnerella_vaginalis Lactobacillus_iners Megasphaera_lornae
## 1  6743909             0.5011474           0.3053081          0.0621420
## 2  6743911             0.0012973           0.6152457          0.0000000
## 3  6743912             0.6630640           0.1090654          0.0192209
## 4  6743914             0.7718984           0.0002261          0.0145733
## 5  6743916             0.5916062           0.1777275          0.0569916
## 6  6743917             0.0005575           0.0042245          0.0000000
##   Prevotella_timonensis
## 1             0.0388053
## 2             0.0358575
## 3             0.0320038
## 4             0.0250850
## 5             0.0113551
## 6             0.0081727

Sanity check that relative abundances sum to 1

hmp2mDF %>%
  column_to_rownames("SampleID") %>%
  rowSums %>%
  as.data.frame() %>%
  summary
##        .         
##  Min.   :0.0000  
##  1st Qu.:1.0000  
##  Median :1.0000  
##  Mean   :0.9987  
##  3rd Qu.:1.0000  
##  Max.   :1.0000
hmp2mDF %>%
  column_to_rownames("SampleID") %>%
  rowSums %>%
  as.data.frame() %>%
  ggplot(aes(x=`.`)) +
  geom_histogram()

Relative abundances of most samples sum to 1.

What is in samples that do not sum to 1?

hmp2mDF %>%
  column_to_rownames("SampleID") %>%
  rowSums %>%
  as.data.frame() %>%
  rownames_to_column("SampleID") %>%
  filter(`.`<0.99) %>%
  formattable()
SampleID .
6744034 0

Appears to be nothing in sample 6744034.

Gardnerella Mapping

Add relative abundance of individual clades and genomospecies in Stanford and UAB cohort samples, and check for differences in finding Gardnerella by MetaPhlAn4 and mapping method.

Stanford and UAB Clades

suGardCladeAb0 <- suGardCladePath %>%
  read_tsv %>%
  mutate(SampleID=as.character(SampleID)) %>%
  select(-propClade) %>%
  filter(n>=2) %>% #count as present if >=2 reads are mapped to that clade/genomospecies
  group_by(SampleID) %>%
  mutate(propClade=n/sum(n)) %>%
  ungroup() %>%
  full_join(sumDF[,c("SampleID", "Gardnerella_vaginalis")], by=c("SampleID")) %>%
  mutate(relativeAbundance=propClade*Gardnerella_vaginalis)

# check if there are any samples where Gard was not found by metaphlan but was found by mapping method
suGardCladeAb0 %>%
  filter(Gardnerella_vaginalis==0, n>1)
## # A tibble: 1 × 6
##   SampleID   Clade     n propClade Gardnerella_vaginalis relativeAbundance
##   <chr>      <chr> <dbl>     <dbl>                 <dbl>             <dbl>
## 1 4008435358 C4        2         1                     0                 0
#check if there are any samples where Gard was found by metaphlan but not by mapping method
suGardCladeAb0 %>%
  filter(is.na(n), Gardnerella_vaginalis>0)
## # A tibble: 0 × 6
## # ℹ 6 variables: SampleID <chr>, Clade <chr>, n <dbl>, propClade <dbl>,
## #   Gardnerella_vaginalis <dbl>, relativeAbundance <dbl>

There is one sample where Gardnerella clades were found by mapping method and not by MetaPhlAn4, and no samples where Gardnerella was found by MetaPhlAn4 but not by mapping method

# make table of clade abundances
suGardCladeAb <- suGardCladeAb0 %>%
  filter(relativeAbundance>0) %>%
  select(SampleID, Clade, relativeAbundance)

Stanford and UAB Genomospecies

suGardGenomoAb0 <- suGardGenomoPath %>%
  read_tsv %>%
  mutate(SampleID=as.character(SampleID)) %>%
  select(-propGenomospecies) %>%
  filter(n>=2) %>%
  group_by(SampleID) %>%
  mutate(propGenomospecies=n/sum(n)) %>%
  ungroup() %>%
  full_join(sumDF[,c("SampleID", "Gardnerella_vaginalis")], by=c("SampleID")) %>%
  mutate(relativeAbundance=propGenomospecies*Gardnerella_vaginalis)

# check if there are any samples where Gard was not found by metaphlan but was found by mapping method
suGardGenomoAb0 %>%
  filter(Gardnerella_vaginalis==0, n>1)
## # A tibble: 0 × 6
## # ℹ 6 variables: SampleID <chr>, Genomospecies <chr>, n <dbl>,
## #   propGenomospecies <dbl>, Gardnerella_vaginalis <dbl>,
## #   relativeAbundance <dbl>
#check if there are any samples where Gard was found by metaphlan but not by mapping method
suGardGenomoAb0 %>%
  filter(is.na(n), Gardnerella_vaginalis>0)
## # A tibble: 1 × 6
##   SampleID   Genomospecies     n propGenomospecies Gardnerella_vaginalis
##   <chr>      <chr>         <dbl>             <dbl>                 <dbl>
## 1 1061235278 <NA>             NA                NA               0.00910
## # ℹ 1 more variable: relativeAbundance <dbl>

No samples where Gardnerella genomospecies were found by mapping method but not MetaPhlAn4 and one sample where Gardnerella was found by MetaPhlAn4 but not by mapping method.

# make table of genomospecies abundances
suGardGenomoAb <- suGardGenomoAb0 %>%
  filter(relativeAbundance>0) %>%
  select(SampleID, Genomospecies, relativeAbundance)

MOMS-PI Clades

hmp2GardCladeAb0 <- hmp2GardCladePath %>%
  read_tsv %>%
  mutate(SampleID=as.character(SampleID)) %>%
  select(-propClade) %>%
  filter(n>=2) %>% # 2+ reads need to map to a clade/genomospecies to be considered present
  group_by(SampleID) %>%
  mutate(propClade=n/sum(n)) %>%
  ungroup() %>%
  full_join(hmp2mDF[,c("SampleID", "Gardnerella_vaginalis")], by=c("SampleID")) %>%
  mutate(relativeAbundance=propClade*Gardnerella_vaginalis) %>%
  filter(SampleID %in% QCfilterSamples)

# check if there are any samples where Gard was not found by metaphlan but was found by mapping method
hmp2GardCladeAb0 %>%
  filter(Gardnerella_vaginalis==0, n>1)
## # A tibble: 75 × 6
##    SampleID Clade     n propClade Gardnerella_vaginalis relativeAbundance
##    <chr>    <chr> <dbl>     <dbl>                 <dbl>             <dbl>
##  1 6744016  C4        2       1                       0                 0
##  2 6744033  C4        3       1                       0                 0
##  3 6744081  C4        5       1                       0                 0
##  4 6744143  C3        6       1                       0                 0
##  5 6744167  C3        2       1                       0                 0
##  6 6744175  C1        2       1                       0                 0
##  7 6744177  C1        2       0.5                     0                 0
##  8 6744177  C2        2       0.5                     0                 0
##  9 6744223  C3        4       0.5                     0                 0
## 10 6744223  C6        4       0.5                     0                 0
## # ℹ 65 more rows
#check if there are any samples where Gard was found by metaphlan but not by mapping method
hmp2GardCladeAb0 %>%
  filter(is.na(n), Gardnerella_vaginalis>0)
## # A tibble: 14 × 6
##    SampleID Clade     n propClade Gardnerella_vaginalis relativeAbundance
##    <chr>    <chr> <dbl>     <dbl>                 <dbl>             <dbl>
##  1 6743917  <NA>     NA        NA             0.000558                 NA
##  2 6744017  <NA>     NA        NA             0.0168                   NA
##  3 6744039  <NA>     NA        NA             0.00161                  NA
##  4 6744091  <NA>     NA        NA             0.00397                  NA
##  5 6744185  <NA>     NA        NA             0.101                    NA
##  6 6744213  <NA>     NA        NA             0.0162                   NA
##  7 6744306  <NA>     NA        NA             0.00376                  NA
##  8 6744425  <NA>     NA        NA             0.000276                 NA
##  9 6744694  <NA>     NA        NA             0.00344                  NA
## 10 6744837  <NA>     NA        NA             0.0000656                NA
## 11 6745043  <NA>     NA        NA             0.00845                  NA
## 12 6745101  <NA>     NA        NA             0.00491                  NA
## 13 6745239  <NA>     NA        NA             0.00364                  NA
## 14 6745280  <NA>     NA        NA             0.00543                  NA

There are 58 samples where Gardnerella was found by MetaPhlan4 but no clades found by mapping method, and 9 samples where Gardnerella was found by mapping method but not by MetaPhlan4

# make abundance table
hmp2GardCladeAb <- hmp2GardCladeAb0 %>%
  filter(relativeAbundance>0) %>%
  select(SampleID, Clade, relativeAbundance)

MOMS-PI Genomospecies

hmp2GardGenomoAb0 <- hmp2GardGenomoPath %>%
  read_tsv %>%
  mutate(SampleID=as.character(SampleID)) %>%
  select(-propGenomospecies) %>%
  filter(n>=2) %>% # 2+ reads mapped to each clade/genomospecies to be considered present
  group_by(SampleID) %>%
  mutate(propGenomospecies=n/sum(n)) %>%
  ungroup() %>%
  full_join(hmp2mDF[,c("SampleID", "Gardnerella_vaginalis")], by=c("SampleID")) %>%
  mutate(relativeAbundance=propGenomospecies*Gardnerella_vaginalis)

# check if there are any samples where Gard was not found by metaphlan but was found by mapping method
hmp2GardGenomoAb0 %>%
  filter(Gardnerella_vaginalis==0, n>1)
## # A tibble: 29 × 6
##    SampleID Genomospecies     n propGenomospecies Gardnerella_vaginalis
##    <chr>    <chr>         <dbl>             <dbl>                 <dbl>
##  1 6744016  GS6               2                 1                     0
##  2 6744033  GS6               3                 1                     0
##  3 6744081  GS6               2                 1                     0
##  4 6744143  GS7               2                 1                     0
##  5 6744175  GS1               2                 1                     0
##  6 6744177  GS4               2                 1                     0
##  7 6744223  GS13              3                 1                     0
##  8 6744227  GS14              5                 1                     0
##  9 6744255  GS4               2                 1                     0
## 10 6744299  GS12              3                 1                     0
## # ℹ 19 more rows
## # ℹ 1 more variable: relativeAbundance <dbl>
#check if there are any samples where Gard was found by metaphlan but not by mapping method
hmp2GardGenomoAb0 %>%
  filter(is.na(n), Gardnerella_vaginalis>0)
## # A tibble: 58 × 6
##    SampleID Genomospecies     n propGenomospecies Gardnerella_vaginalis
##    <chr>    <chr>         <dbl>             <dbl>                 <dbl>
##  1 6743917  <NA>             NA                NA              0.000558
##  2 6743926  <NA>             NA                NA              0.00584 
##  3 6743955  <NA>             NA                NA              0.000612
##  4 6744017  <NA>             NA                NA              0.0168  
##  5 6744039  <NA>             NA                NA              0.00161 
##  6 6744042  <NA>             NA                NA              0.0109  
##  7 6744049  <NA>             NA                NA              0.000869
##  8 6744054  <NA>             NA                NA              0.00698 
##  9 6744071  <NA>             NA                NA              0.00970 
## 10 6744091  <NA>             NA                NA              0.00397 
## # ℹ 48 more rows
## # ℹ 1 more variable: relativeAbundance <dbl>

127 samples where Gardnerella was found by MetaPhlan4 but no genomospecies were found by mapping method and 2 samples where Gardnerella was found by mapping method but not by MetaPhlan4

#make abundance table
hmp2GardGenomoAb <- hmp2GardGenomoAb0 %>%
  filter(relativeAbundance>0) %>%
  select(SampleID, Genomospecies, relativeAbundance)

Further assess uncharacterized samples:

Asses samples where Gardnerella was uncharacterized at the clade level. (Where Gardnerella was found by MetaPhlAn2 but no clades were identified using the mapping method).

unchGardSamples <- hmp2GardCladeAb0 %>%
  filter(is.na(n), Gardnerella_vaginalis>0) %>%
  .$SampleID

Comparison of number of microbial paired reads and proportion Gardnerella in samples with uncharacterized Gardnerella vs. samples where Gardnerella was found with both methods.

q <- hmp2mDF %>%
  select(SampleID, Gardnerella_vaginalis) %>%
  mutate(unchGard=case_when(SampleID %in% unchGardSamples~TRUE,
                            !(SampleID %in% unchGardSamples)~FALSE)) %>%
  left_join(sgDF1[,c("SampleID", "bbmapFiltered_pairs", "bacLoad")], by="SampleID") %>%
  ggplot(aes(x=Gardnerella_vaginalis, y=bbmapFiltered_pairs, color=unchGard, shape=unchGard)) +
  geom_point(alpha=0.5) +
  theme(legend.position = "bottom") +
  binaryColors +
  labs(x="Proportion Gardnerella", y="Microbial Reads", color="Uncharacterized Gardnerella", shape="Uncharacterized Gardnerella")

ggMarginal(q, groupFill = TRUE, type="density")

Samples with uncharacterized Gardnerella have fewer microbial reads and a lower proportion of Gardnerella

Join abundance tables

# Gardnerella abundance
gardCladeAb0 <- full_join(suGardCladeAb, hmp2GardCladeAb, by = c("SampleID", "Clade", "relativeAbundance"))
gardGenomoAb0 <- full_join(suGardGenomoAb, hmp2GardGenomoAb, by = c("SampleID", "Genomospecies", "relativeAbundance"))

# metaphlan
mDF0 <- sumDF %>%
  full_join(hmp2mDF)

#number of speacies that appear in only one vs. both samples
testOverlap <- summarise_all(mDF0, anyNA) %>%
  gather("Species", "anyNA", 2:ncol(.)) %>%
  select(-SampleID)

nrow(testOverlap)
## [1] 517
table(testOverlap$anyNA)
## 
## FALSE  TRUE 
##   294   223

364 total species and 209 appear in both studies (Stanford/UAB vs. MOMS-PI). Number of species in each study:

ncol(sumDF)-1
## [1] 326
ncol(hmp2mDF)-1
## [1] 485

Replace NAs with zeros for abundance values for species that did not appear in each respective study.

mDF1 <- mDF0 %>%
  mutate_if(is.numeric, ~replace_na(.x, 0)) %>%
  left_join(sgDF1[,c("SampleID", "Cohort")], by="SampleID") %>%
  select(SampleID, Cohort, everything())
anyNA(mDF1) # check for remaining NAs
## [1] FALSE

Select enriched Gardnerella MOMS-PI subset

Proportion *Gardnerella* in samples by study:
mDF1 %>%
  select(SampleID, Cohort, Gardnerella_vaginalis) %>%
  mutate(Cohort=case_when(Cohort=="Stanford Enriched"|Cohort=="UAB Enriched"~"Stanford & UAB Enriched",
                          Cohort=="MOMS-PI"~"MOMS-PI"),
         Cohort=factor(Cohort, levels = c("Stanford & UAB Enriched", "MOMS-PI"))) %>%
  ggplot(aes(x=Gardnerella_vaginalis, color=Cohort, fill=Cohort, group=Cohort)) +
  geom_density(alpha=0.5) +
  binaryColors +
  labs(x="Proportion Gardnerella")

gardAbundance2Cohort <- mDF1 %>%
  select(SampleID, Cohort, Gardnerella_vaginalis) %>%
  left_join(sgDF1[,c("SampleID", "SubjectID")], by="SampleID") %>%
  mutate(Cohort=case_when(Cohort=="Stanford Enriched"|Cohort=="UAB Enriched"~"Stanford & UAB Enriched",
                          Cohort=="MOMS-PI"~"MOMS-PI"),
         Cohort=factor(Cohort, levels = c("Stanford & UAB Enriched", "MOMS-PI")))
gardAbundance2Cohort %>%
  group_by(Cohort) %>%
  summarize(min=min(Gardnerella_vaginalis),
            `1stQu`=summary(Gardnerella_vaginalis)[2],
            median=median(Gardnerella_vaginalis),
            mean=mean(Gardnerella_vaginalis),
            `3rdQu`=summary(Gardnerella_vaginalis)[5],
            max=max(Gardnerella_vaginalis)) %>%
  formattable()
Cohort min 1stQu median mean 3rdQu max
Stanford & UAB Enriched 0 0.184295 0.4610783 0.4378226 0.6828045 0.9791351
MOMS-PI 0 0.000000 0.0540306 0.2327505 0.4664893 1.0000000

Proportion Gardnerella by as subject averages by study:

subjectGardAbundance2Cohort <- gardAbundance2Cohort %>%
  group_by(SubjectID, Cohort) %>%
  summarize(Gardnerella_vaginalis=mean(Gardnerella_vaginalis)) %>%
  ungroup
  
subjectAvgPropGard <- subjectGardAbundance2Cohort %>%
  ggplot(aes(x=Gardnerella_vaginalis, color=Cohort, fill=Cohort)) +
  geom_density(alpha=0.5) +
  binaryColors +
  theme(legend.position = "bottom", 
        legend.text = element_text(size=10), 
        legend.title = element_text(size=11)) +
  labs(x="Subject Average Proportion Gardnerella")
subjectAvgPropGard

MGS sequenced samples from Stanford and UAB were selected based on enrichment of Gardnerella at the subject level. Create a cohort of a subset of samples in the MOMS-PI cohort that are also selected for enrichment of Gardnerella at the subject level for better comparison.

ntiles <- subjectGardAbundance2Cohort %>%
  filter(Cohort=="Stanford & UAB Enriched") %>%
  mutate(decile=ntile(Gardnerella_vaginalis, 3)) %>%
  group_by(decile) %>%
  filter(Gardnerella_vaginalis==min(Gardnerella_vaginalis)) %>%
  .$Gardnerella_vaginalis %>%
  sort %>%
  unique
ntiles
## [1] 0.0000000 0.3384204 0.6256335
# subjectGardAbundance2Cohort0 <- subjectGardAbundance2Cohort %>%
#   filter(Cohort=="MOMS-PI",
#          SubjectID %in% hmp2SubjectsWDel) %>% # *** For selecting subjects that have gestational age at delivery  ***
#   mutate(Ntile=case_when(between(Gardnerella_vaginalis, ntiles[1], ntiles[2])~1,
#                         between(Gardnerella_vaginalis, ntiles[2], ntiles[3])~2,
#                         Gardnerella_vaginalis > ntiles[3]~3))
# 
# subjN <- min(count(subjectGardAbundance2Cohort0, Ntile)$n) # number of subjects from each ntile to take
# 
# HMP2highGardSubjects <- subjectGardAbundance2Cohort0 %>% 
#   group_by(Ntile) %>%
#   sample_n(subjN, replace = FALSE) %>%
#   .$SubjectID
#  
# write_lines(HMP2highGardSubjects, "./HMP2highGardSubjects.txt")
HMP2highGardSubjects <- read_lines("./HMP2highGardSubjects.txt")

Figure S5: Distributions of subject average Gardnerella abundances

#fig.width=5, fig.height=2.5
subjectAvgPropGardEnr <- subjectGardAbundance2Cohort %>%
  filter(Cohort=="Stanford & UAB Enriched"|(Cohort=="MOMS-PI"&SubjectID %in% HMP2highGardSubjects)) %>%
  mutate(Cohort=factor(Cohort, levels=c("Stanford & UAB Enriched", "MOMS-PI"), labels=c("Stanford & UAB Enriched", "MOMS-PI Enriched"))) %>%
  ggplot(aes(x=Gardnerella_vaginalis, color=Cohort, fill=Cohort, group=Cohort)) +
  geom_density(alpha=0.5) +
  binaryColors +
  theme(legend.position = "bottom", 
        legend.text = element_text(size=10), 
        legend.title = element_text(size=11)) +
  labs(x="Subject Average Proportion Gardnerella")

plot_grid(subjectAvgPropGard, subjectAvgPropGardEnr, nrow=1, labels = c("A", "B"), label_size = 15)

#ggsave(filename = file.path(figureOut, paste(Sys.Date(), "FigureS5_enrichedSubjectPropGardbyStudy.png", sep="_")))
subjectGardAbundance2Cohort %>%
  group_by(Cohort) %>%
  summarize(min=min(Gardnerella_vaginalis),
            `1stQu`=summary(Gardnerella_vaginalis)[2],
            median=median(Gardnerella_vaginalis),
            mean=mean(Gardnerella_vaginalis),
            `3rdQu`=summary(Gardnerella_vaginalis)[5],
            max=max(Gardnerella_vaginalis)) %>%
  formattable()
Cohort min 1stQu median mean 3rdQu max
Stanford & UAB Enriched 0 0.27220362 0.4359771 0.4407596 0.6436723 0.9160428
MOMS-PI 0 0.01085372 0.1437913 0.2320526 0.4024642 0.9962414
subjectGardAbundance2Cohort %>%
  filter((Cohort=="MOMS-PI"&SubjectID %in% HMP2highGardSubjects)) %>%
  mutate(Cohort=factor(Cohort, levels=c("Stanford & UAB Enriched", "MOMS-PI"), labels=c("Stanford & UAB Enriched", "MOMS-PI Enriched"))) %>%
  group_by(Cohort) %>%
  summarize(min=min(Gardnerella_vaginalis),
            `1stQu`=summary(Gardnerella_vaginalis)[2],
            median=median(Gardnerella_vaginalis),
            mean=mean(Gardnerella_vaginalis),
            `3rdQu`=summary(Gardnerella_vaginalis)[5],
            max=max(Gardnerella_vaginalis)) %>%
  formattable()
Cohort min 1stQu median mean 3rdQu max
MOMS-PI Enriched 0 0.1950071 0.3913007 0.415446 0.6580644 0.8334826

Distribution of Gardnerella abundnace in samples

HMP2HighGardSamples <- sgDF1 %>%
  filter(SubjectID %in% HMP2highGardSubjects) %>%
  .$SampleID

gardAbundance2Cohort %>%
  filter(Cohort=="Stanford & UAB Enriched"|Cohort=="MOMS-PI"&SubjectID %in% HMP2highGardSubjects) %>%
  mutate(Cohort=factor(Cohort, levels=c("Stanford & UAB Enriched", "MOMS-PI"), labels=c("Stanford & UAB Enriched", "MOMS-PI Enriched"))) %>%
  ggplot(aes(x=Gardnerella_vaginalis, color=Cohort, fill=Cohort)) +
  geom_density(alpha=0.5) +
  binaryColors +
  labs(x="Proportion of Gardnerella")

Add enriched MOMS-PI cohort to tables Sample and subject metadata:

# extract enriched Gardnerella subset and full join to metadata 
highGardSgDF <- sgDF1 %>%
  filter(SubjectID %in% HMP2highGardSubjects) %>% 
  mutate(Cohort="MOMS-PI Enriched",
         SampleID=paste0(SampleID, "_E"),
         SubjectID=paste0(SubjectID, "_E"))

nrow(highGardSgDF)  
## [1] 145
sgDF <- sgDF1 %>%
  mutate(Cohort=as.vector(Cohort),
         SubjectID=as.character(SubjectID)) %>%
  full_join(highGardSgDF, by=colnames(sgDF1)) %>%
  mutate(Cohort=factor(Cohort, levels = cohorts))

bacLoad <- sgDF %>%
  select(SampleID, bacLoad)

sgDF %>%
  group_by(Cohort) %>%
  summarize(`N Subjects`=n_distinct(SubjectID), `N Samples`=n_distinct(SampleID)) %>%
  formattable()
Cohort N Subjects N Samples
Stanford Enriched 20 62
UAB Enriched 15 45
MOMS-PI Enriched 42 145
MOMS-PI 231 781

MetaPhlAn4 species abundance tables:

highGardmDF <- mDF1 %>%
  filter(SampleID %in% HMP2HighGardSamples) %>%
  mutate(Cohort="MOMS-PI Enriched",
         SampleID=paste0(SampleID, "_E"))

mDF <- mDF1 %>%
  mutate(Cohort=as.vector(Cohort)) %>%
  rbind(highGardmDF) %>%
  mutate(Cohort=factor(Cohort, levels = c("Stanford Enriched", "UAB Enriched", "MOMS-PI Enriched", "MOMS-PI")))

Gardnerella clade and genomospecies abundance tables:

# Clade Abundances
gardCladeAb1 <- gardCladeAb0 %>%
  left_join(sgDF1[,c("SampleID", "Cohort")], by="SampleID") %>%
  mutate(Cohort=as.vector(Cohort))
 
highGardCladeAb <- gardCladeAb1 %>%
  filter(SampleID %in% HMP2HighGardSamples) %>%
  mutate(Cohort="MOMS-PI Enriched",
         SampleID=paste0(SampleID, "_E"))

gardCladeAb <- gardCladeAb1 %>%
  rbind(highGardCladeAb) %>%
  mutate(Cohort=factor(Cohort, levels = cohorts))

# Gemomospecies Abundances
gardGenomoAb1 <- gardGenomoAb0 %>%
  left_join(sgDF1[,c("SampleID", "Cohort")], by="SampleID") %>%
  mutate(Cohort=as.vector(Cohort))
 
highGardGenomoAb <- gardGenomoAb1 %>%
  filter(SampleID %in% HMP2HighGardSamples) %>%
  mutate(Cohort="MOMS-PI Enriched", 
         SampleID=paste0(SampleID, "_E"))

gardGenomoAb <- gardGenomoAb1 %>%
  rbind(highGardGenomoAb) %>%
  mutate(Cohort=factor(Cohort, levels = cohorts))

# note these tables contain samples with at least one identified clade or genomospecies of gardnerella

Top taxa

avgTop20Abundance0 <- mDF %>%
  select(SampleID, Cohort, everything()) %>%
  gather("Species", "Abundance", 3:ncol(.)) %>%
  group_by(Cohort, Species) %>%
  summarise(avgAbundance=mean(Abundance)) 

top20taxa <- mDF %>%
  select(SampleID, Cohort, everything()) %>%
  gather("Species", "Abundance", 3:ncol(.)) %>%
  with_groups(Species, summarize, avgAbundance=mean(Abundance)) %>%
  top_n(20, avgAbundance) %>%
  .$Species

avgTop20Abundance0 %>%
  spread(Cohort, avgAbundance) %>%
  mutate(stanfordRank=rank(-`Stanford Enriched`),
         uabRank=rank(-`UAB Enriched`), 
         momspiRank=rank(-`MOMS-PI`),
         momspiERank=rank(-`MOMS-PI Enriched`),
         Stanford=round(`Stanford Enriched`, 2),
         UAB=round(`UAB Enriched`,2),
         `MOMS-PI`=round(`MOMS-PI`, 2),
         `MOMS-PI Enriched`=round(`MOMS-PI Enriched`, 2)) %>%
  filter(Species %in% top20taxa) %>%
  select(Species, `Stanford Enriched`, stanfordRank, `UAB Enriched`, uabRank, `MOMS-PI Enriched`, momspiERank, `MOMS-PI`, momspiRank) %>%
  arrange(stanfordRank) %>%
  formattable()
Species Stanford Enriched stanfordRank UAB Enriched uabRank MOMS-PI Enriched momspiERank MOMS-PI momspiRank
Gardnerella_vaginalis 0.371412653 1 5.293208e-01 1.0 0.41 1 0.23 2
Lactobacillus_crispatus 0.141630568 2 2.990035e-02 4.0 0.06 3 0.17 3
Lactobacillus_iners 0.136737252 3 1.457875e-01 2.0 0.20 2 0.30 1
Lactobacillus_gasseri 0.075189324 4 2.864444e-06 201.0 0.01 12 0.04 5
Lactobacillus_jensenii 0.054525440 5 4.990044e-04 56.0 0.04 5 0.05 4
Fannyhessea_vaginae 0.041150944 6 6.386470e-02 3.0 0.04 4 0.02 6
Megasphaera_lornae 0.029528485 7 2.731367e-02 5.0 0.03 6 0.02 7
GGB3012_SGB4003 0.009627750 8 1.280285e-02 7.0 0.01 10 0.01 15
Coriobacteriales_bacterium_DNF00809 0.009062076 9 9.327858e-03 13.0 0.00 23 0.00 20
Prevotella_timonensis 0.008927881 10 7.786953e-03 15.0 0.02 9 0.01 9
Ureaplasma_parvum 0.008162684 11 1.560520e-03 34.0 0.00 30 0.01 13
Prevotella_bivia 0.007158061 12 1.758066e-02 6.0 0.00 17 0.01 17
Mycoplasma_hominis 0.006578639 13 9.944680e-03 11.0 0.01 14 0.01 11
Prevotella_amnii 0.004770915 16 9.524909e-03 12.0 0.01 13 0.01 14
GGB753_SGB989 0.004604540 18 1.104966e-02 10.0 0.01 16 0.00 21
Sneathia_vaginalis 0.004111276 20 1.192308e-02 8.0 0.01 11 0.01 10
Veillonellaceae_bacterium_DNF00626 0.004078663 21 3.235882e-03 22.0 0.01 15 0.01 16
Dialister_micraerophilus 0.002333300 28 2.236473e-03 27.0 0.00 18 0.00 19
Bifidobacteriaceae_bacterium_NR047 0.000000000 389 1.126034e-02 9.0 0.02 8 0.01 12
Bradyrhizobium_viridifuturi 0.000000000 389 0.000000e+00 377.5 0.03 7 0.02 8
x <- mDF %>%
  select(SampleID, Cohort, top20taxa) %>%
  mutate_if(is.numeric, ~case_when(.x>0~1,
                                   .x==0~0))%>%
  group_by(Cohort) %>%
  summarize_if(is.numeric, ~sum(.x)/n()) %>%
  data.frame()
rownames(x) <- x$Cohort
x %>%
  select(-Cohort) %>%
  t %>%
  as.data.frame() %>%
  rownames_to_column("Species") %>%
  arrange(-`Stanford Enriched`) %>%
  formattable()
Species Stanford Enriched UAB Enriched MOMS-PI Enriched MOMS-PI
Gardnerella_vaginalis 0.8548387 0.95555556 0.88275862 0.7323944
Fannyhessea_vaginae 0.5645161 0.82222222 0.67586207 0.4212548
Dialister_micraerophilus 0.5161290 0.80000000 0.57241379 0.3610755
Prevotella_timonensis 0.5161290 0.68888889 0.48965517 0.3674776
Lactobacillus_iners 0.4516129 0.91111111 0.88965517 0.7989757
Veillonellaceae_bacterium_DNF00626 0.4354839 0.68888889 0.59310345 0.3469910
Prevotella_bivia 0.3709677 0.51111111 0.15172414 0.1728553
Lactobacillus_gasseri 0.3548387 0.04444444 0.06896552 0.1408451
Megasphaera_lornae 0.3387097 0.80000000 0.63448276 0.3982074
Ureaplasma_parvum 0.3387097 0.42222222 0.31034483 0.3265045
Coriobacteriales_bacterium_DNF00809 0.2903226 0.71111111 0.47586207 0.2650448
GGB753_SGB989 0.2903226 0.68888889 0.51034483 0.2394366
Lactobacillus_crispatus 0.2741935 0.08888889 0.24827586 0.3201024
Sneathia_vaginalis 0.2741935 0.66666667 0.44827586 0.2535211
Prevotella_amnii 0.2580645 0.42222222 0.40000000 0.2304738
GGB3012_SGB4003 0.2419355 0.57777778 0.51724138 0.2419974
Lactobacillus_jensenii 0.2096774 0.13333333 0.18620690 0.2816901
Mycoplasma_hominis 0.1129032 0.66666667 0.29655172 0.1779770
Bifidobacteriaceae_bacterium_NR047 0.0000000 0.28888889 0.20000000 0.1395647
Bradyrhizobium_viridifuturi 0.0000000 0.00000000 0.51034483 0.3265045

Create abundance table with taxa of interest

# abundance table of taxa of interest with clades
mDFtoiClades <- gardCladeAb %>%
  spread(Clade, relativeAbundance) %>%
  full_join(mDF, by=c("SampleID", "Cohort")) %>%
  select(SampleID, Cohort, Gardnerella_vaginalis, clades, lactos, anaerobes) %>% # keep total Gardnerella abundance in table for now. Many operations are taxon vs. taxon. Will need to remove this column when analyses call for it.
  mutate_at(vars(c(Gardnerella_vaginalis, clades, lactos, anaerobes)), replace_na, 0) %>%
  mutate(Cohort=factor(Cohort, levels=cohorts))

Gardnerella variant abundance

Clades per sample

gardCladeAbU <- gardCladeAb %>%
  full_join(mDF[,c("SampleID", "Cohort", "Gardnerella_vaginalis")], by=c("SampleID","Cohort")) %>%
  spread(Clade, relativeAbundance) %>%
  select(-`<NA>`) %>% # remove "NA column" created by samples that have zero Gardnerella
  mutate(Uncharacterized_Gardnerella=case_when(Gardnerella_vaginalis>=0&is.na(C1)&is.na(C2)&is.na(C3)&is.na(C4)&is.na(C5)&is.na(C6)~Gardnerella_vaginalis,
                                               Gardnerella_vaginalis>0&!is.na(clades)~0)) %>%
  gather("var", "relativeAbundance", c(clades, Gardnerella_vaginalis, Uncharacterized_Gardnerella)) %>%
  replace_na(list(relativeAbundance=0))

Average clades per sample in each cohorts:

gardCladeAbU %>%
  spread(var, relativeAbundance) %>%
  mutate_at(clades, ~case_when(.x>0~1,
                               .x==0~0)) %>%
  mutate(nClades=C1+C2+C3+C4+C5+C6) %>%
  with_groups(Cohort, summarize, mean=mean(nClades), `standard deviation`=sd(nClades), median=median(nClades)) %>%
  formattable
Cohort mean standard deviation median
Stanford Enriched 3.016129 1.894703 3
UAB Enriched 4.777778 1.593864 5
MOMS-PI Enriched 3.717241 1.978016 4
MOMS-PI 2.571063 2.162414 2

Average clades per sample across all cohorts:

gardCladeAbU %>%
  spread(var, relativeAbundance) %>%
  mutate_at(clades, ~case_when(.x>0~1,
                               .x==0~0)) %>%
  mutate(nClades=C1+C2+C3+C4+C5+C6) %>%
  summarize(mean=mean(nClades), `standard deviation`=sd(nClades), median=median(nClades)) %>%
  formattable
mean standard deviation median
2.854792 2.174602 3
cladesPerSample <- gardCladeAbU %>%
  spread(var, relativeAbundance) %>%
  mutate_at(clades, ~case_when(.x>0~1,
                               .x==0~0)) %>%
  mutate(nClades=C1+C2+C3+C4+C5+C6) %>%
  count(Cohort, nClades) %>%
  group_by(Cohort) %>%
  mutate(n=n/sum(n)) %>%
  ggplot(aes(x=nClades, y=n)) +
  geom_bar(stat="identity", fill = "black") +
  facet_wrap(~Cohort) +
  labs(x="Clades per Sample", y="Proportion of Samples", fill=bquote('Uncharacterized'~italic(Gardnerella))) +
  theme(axis.text.x = element_text(size=9),
        strip.text = element_text(size=13),
        aspect.ratio = 0.5)

Genomospecies per sample

gardGenomoAbU <- gardGenomoAb %>%
  spread(Genomospecies, relativeAbundance) %>%
  full_join(mDF[,c("SampleID", "Cohort", "Gardnerella_vaginalis")], by=c("SampleID", "Cohort")) %>%
  mutate(Uncharacterized_Gardnerella=case_when(Gardnerella_vaginalis>=0&is.na(GS1)&is.na(GS2)&is.na(GS3)&is.na(GS4)&is.na(GS5)&is.na(GS6)&is.na(GS7)&is.na(GS8)&is.na(GS9)&is.na(GS10)&is.na(GS11)&is.na(GS12)&is.na(GS13)&is.na(GS14)~Gardnerella_vaginalis,
                                               Gardnerella_vaginalis>0&!is.na(genomos)~0)) %>%
  gather("var", "relativeAbundance", c(genomos, Gardnerella_vaginalis, Uncharacterized_Gardnerella)) %>%
  replace_na(list(relativeAbundance=0))

Average genomospecies per sample in each cohorts:

 gardGenomoAbU %>%
  spread(var, relativeAbundance) %>%
  mutate_at(genomos, ~case_when(.x>0~1,
                               .x==0~0)) %>%
  mutate(nGenomos=GS1+GS2+GS3+GS4+GS5+GS6+GS7+GS8+GS9+GS10+GS11+GS12+GS13+GS14) %>%
  with_groups(Cohort, summarize, mean=mean(nGenomos), `standard deviation`=sd(nGenomos), median=median(nGenomos)) %>%
  formattable
Cohort mean standard deviation median
Stanford Enriched 4.016129 3.226387 4
UAB Enriched 9.177778 4.339506 9
MOMS-PI Enriched 5.344828 3.853893 5
MOMS-PI 3.348271 3.776321 2

Number of samples with all 14 genomospecies

 gardGenomoAbU %>%
  spread(var, relativeAbundance) %>%
  mutate_at(genomos, ~case_when(.x>0~1,
                               .x==0~0)) %>%
  mutate(nGenomos=GS1+GS2+GS3+GS4+GS5+GS6+GS7+GS8+GS9+GS10+GS11+GS12+GS13+GS14) %>%
  count(Cohort, nGenomos) %>%
  group_by(Cohort) %>%
  mutate(pcnt=n/sum(n)*100) %>% # convert to fraction of sample
  filter(nGenomos==14) %>%
  formattable
Cohort nGenomos n pcnt
Stanford Enriched 14 1 1.6129032
UAB Enriched 14 12 26.6666667
MOMS-PI Enriched 14 1 0.6896552
MOMS-PI 14 6 0.7682458

Average genomospecies per sample across all cohorts:

 gardGenomoAbU %>%
  spread(var, relativeAbundance) %>%
  mutate_at(genomos, ~case_when(.x>0~1,
                               .x==0~0)) %>%
  mutate(nGenomos=GS1+GS2+GS3+GS4+GS5+GS6+GS7+GS8+GS9+GS10+GS11+GS12+GS13+GS14) %>%
  summarize(mean=mean(nGenomos), `standard deviation`=sd(nGenomos), median=median(nGenomos)) %>%
  formattable
mean standard deviation median
3.922556 4.000945 3
genomosPerSample <- gardGenomoAbU %>%
  spread(var, relativeAbundance) %>%
  mutate_at(genomos, ~case_when(.x>0~1,
                               .x==0~0)) %>%
  mutate(nGenomos=GS1+GS2+GS3+GS4+GS5+GS6+GS7+GS8+GS9+GS10+GS11+GS12+GS13+GS14) %>%
  count(Cohort, nGenomos) %>%
  group_by(Cohort) %>%
  mutate(n=n/sum(n), # convert to fraction of sample
         nGenomos=factor(nGenomos, c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14"))) %>% 
  ggplot(aes(x=nGenomos, y=n)) +
  geom_bar(stat="identity", fill = "black") +
  facet_wrap(~Cohort) +
  labs(x="Species per Sample", y="Proportion of Samples") +
  theme(axis.text.x = element_text(size=9),
        strip.text = element_text(size=13),
        aspect.ratio = 0.5)

Variant Relative abundance

Clades

Calculate clade prevalences

prevTable <- gardCladeAbU %>%
  with_groups(c(Cohort, var), summarize, p = sum(relativeAbundance > 0), n = n(), m=mean(relativeAbundance), sd=sd(relativeAbundance)) %>% 
  mutate(prevalence=round(p/n, 2),
         var=factor(var, levels = c(clades, "Uncharacterized_Gardnerella",  "Gardnerella_vaginalis"), labels=c(clades, "Unch.", "Total")))

Sample relative abundances with prevalence

gardCladeAbU %>%
  filter(var!="Uncharacterized_Gardnerella") %>%
  mutate(var=factor(var, levels = c(clades,  "Gardnerella_vaginalis"), labels=c(clades, "Total"))) %>%
  left_join(prevTable[,c("Cohort", "var", "prevalence")], by=c("Cohort", "var")) %>%
  mutate(Cohort=factor(Cohort, levels = cohorts, labels=cohortAbbrevs)) %>%
  ggplot(aes(x=var, y=relativeAbundance, color=var, fill=var)) +
  geom_quasirandom(alpha=0.5) +
  geom_boxplot(color="black", alpha=0) +
  geom_text(aes(y=1.2, x=var, label=prevalence), color="black", size=5) +
  scale_y_continuous(breaks = c(0, 0.25, 0.50, 0.75, 1, 1.2), limits = c(0,1.3), labels=c("0.00", "0.25", "0.50", "0.75", "1.00", "prevalence")) +
  cladeColorsTotal +
  facet_grid(Cohort~.) +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 16),
        axis.text.y = element_text(size=16),
        axis.title.y = element_text(size=23),
        strip.text = element_text(size=17)) +
  labs(x=NULL, y="Relative Abundance")

Clades differ in abundance and abundance of clades varies across cohorts. Clade 3 is less abundant in Stanford cohort but abundant in other cohorts, especially UAB. Clade 1 is more abundant in Stanford. Clade 6 is rare in Stanford but appears in UAB and MOMS-PI cohorts.

Prevalence vs relative abundance of Gardnerella clades

cladeAbvPrev <- prevTable %>%
  filter(var!="Unch.") %>%
  ggplot(aes(x=prevalence, y=m, color=var)) +
  geom_point(size=3, alpha=0.75) +
  scale_x_continuous(limits=c(0,1), breaks=seq(0,1,0.2)) +
  scale_y_continuous(limits =c(0,1), breaks=seq(0,1,0.2)) +
  facet_wrap(~Cohort) +
  cladeColorsTotal +
  coord_fixed() +
  theme(legend.position = "right",
        strip.text = element_text(size=12)) +
  labs(x="Prevalence", y="Mean Relative Abundance", color="Clade")

Genomospecies

Calculate genomospecies prevalences

genomosPrevTable <- gardGenomoAbU %>%
  with_groups(c(Cohort, var), summarize, p = sum(relativeAbundance > 0), n = n(), m=mean(relativeAbundance), sd=sd(relativeAbundance)) %>% 
  mutate(prevalence=round(p/n, 2))

Sample relative abundance with prevalence

gardGenomoAbU %>%
  left_join(genomosPrevTable[,c("Cohort", "var", "prevalence")], by=c("Cohort", "var")) %>%
  filter(var!="Uncharacterized_Gardnerella") %>%
  mutate(var=factor(var, levels=c(genomosCladeOrder, "Gardnerella_vaginalis"), labels = c(genomoNamesCladeOrder, "Total")),
         Cohort=factor(Cohort, levels=cohorts, labels = cohortAbbrevs)) %>%
  ggplot(aes(x=var, y=relativeAbundance, color=var, fill=var)) +
  geom_quasirandom(alpha=0.5) +
  geom_boxplot(color="black", alpha=0) +
  geom_text(aes(y=1.2, x=var, label=prevalence), color="black", size=5) +
  scale_y_continuous(breaks = c(0, 0.25, 0.50, 0.75, 1, 1.2), limits = c(0,1.3), labels=c("0.00", "0.25", "0.50", "0.75", "1.00", "prevalence")) +
  facet_grid(Cohort~.) +
  genomoColorsTotal +
  theme(axis.text.x = element_text(size=11),
        axis.text.y = element_text(size=12),
        axis.title.y = element_text(size=20),
        strip.text = element_text(size = 11),
        legend.position="none") +
  labs(x=NULL, y="Relative Abundance")

Prevalence vs relative abundance of Gardnerella genomospecies

genomoAbvPrev <- gardGenomoAbU %>%
  group_by(Cohort, var) %>%
  summarise(avgAbundance=mean(relativeAbundance)) %>%
  left_join(genomosPrevTable, by=c("Cohort", "var")) %>%
  filter(var!="Uncharacterized_Gardnerella") %>%
  dplyr::rename(Genomospecies=var) %>%
  mutate(Genomospecies=factor(Genomospecies, levels = c(genomosCladeOrder, "Gardnerella_vaginalis"))) %>%
  ggplot(aes(x=prevalence, y=avgAbundance, color=Genomospecies)) +
  geom_point(size=3, alpha=0.75) +
  scale_x_continuous(limits=c(0,1), breaks=seq(0,1,0.2)) +
  scale_y_continuous(limits =c(0,1), breaks=seq(0,1,0.2)) +
  genomoColorsTotal +
  facet_wrap(~Cohort) +
  coord_fixed() +
  theme(legend.position = "right", 
        strip.text = element_text(size=12)) +
  labs(x="Prevalence", y="Mean Relative Abundance", color="Species")

Figure 3: Prevalence and abundance of Gardnerella variants across the four cohorts

#fig.width = 6, fig.height=4
varsPerSample <- plot_grid(cladesPerSample, genomosPerSample, nrow = 1, labels = c("A", "B"), label_size = 15)
gardVarAbun <- plot_grid(cladeAbvPrev, genomoAbvPrev, nrow = 1, labels = c("C", "D"), label_size = 15)
plot_grid(varsPerSample, gardVarAbun, nrow = 2, axis = "l")

#ggsave(filename = file.path(figureOut, paste(Sys.Date(), "Figure3_cladeGenomoPrevAbun.png", sep="_")))

Microbial Load

Compare microbial load across the four cohorts:

# table of descriptive statistics with outliers
sgDF %>%
  group_by(Cohort) %>%
  summarise_at("bacLoad", list(min=min, mean=mean, median=median, max=max)) %>%
  mutate_if(is.numeric, ~round(.x, 4)) %>%
  formattable()
Cohort min mean median max
Stanford Enriched 0.0035 2.5594 0.0504 150.0600
UAB Enriched 0.0036 0.4658 0.2504 6.0354
MOMS-PI Enriched 0.0111 0.2492 0.1259 4.7932
MOMS-PI 0.0038 0.2074 0.0446 47.6365
# microbial load of outliers
sgDF %>%
  filter(bacLoad>2) %>%
  select(SampleID, Cohort, bacLoad) %>%
  formattable
SampleID Cohort bacLoad
4009835178 UAB Enriched 6.035434
1005601348 Stanford Enriched 150.059967
6744677 MOMS-PI 4.103960
6743991 MOMS-PI 12.019563
6744354 MOMS-PI 47.636472
6744871 MOMS-PI 4.793229
6744459 MOMS-PI 4.739440
6744677_E MOMS-PI Enriched 4.103960
6744871_E MOMS-PI Enriched 4.793229
#Figure S6
# density plot without outliers (bacload ≥ 2)
sgDF %>%
  filter(bacLoad<2) %>%
  mutate(Cohort=factor(Cohort, levels=cohorts, labels=cohortAbbrevs)) %>%
  ggplot(aes(x=bacLoad, fill=Cohort, color=Cohort)) +
  geom_density(alpha=0.5) +
  quadColors2 +
  theme(legend.position = c(0.85,0.8)) +
  labs(x="Microbial Load")

#ggsave(filename = file.path(figureOut, paste(Sys.Date(), "FigureS6_bacLoadDensNoOut.png", sep="_")))


# table of descriptive statistics without outliers
sgDF %>%
  filter(bacLoad<2) %>%
  group_by(Cohort) %>%
  summarise_at("bacLoad", list(min=min, mean=mean, median=median, max=max)) %>%
  mutate_if(is.numeric, ~round(.x, 4)) %>%
  formattable()
Cohort min mean median max
Stanford Enriched 0.0035 0.1414 0.0498 1.0077
UAB Enriched 0.0036 0.3392 0.2417 1.2209
MOMS-PI Enriched 0.0111 0.1905 0.1201 1.4112
MOMS-PI 0.0038 0.1143 0.0441 1.8656

Absolute abundance

Clade aboslute abundance

All samples absolute abundance

gardCladeAbU %>%
  left_join(bacLoad, by="SampleID") %>%
  filter(bacLoad<2) %>%
  mutate(absoluteAbundance=relativeAbundance*bacLoad, 
         var=factor(var, levels = c(clades, "Uncharacterized_Gardnerella", "Gardnerella_vaginalis"), labels = c(clades, "Unch.", "Total")),
         Cohort=factor(Cohort, levels = cohorts, labels=cohortAbbrevs)) %>%
  ggplot(aes(x=var, y=absoluteAbundance, color=var)) +
  geom_quasirandom(alpha=0.5) +
  geom_boxplot(color="black", alpha=0) +
  cladeColorsTotalUnch +
  scale_y_continuous(limits=c(0,1), n.breaks = 5) +
  facet_grid(Cohort~.) +
  theme(legend.position = "none",
        axis.text.x = element_text(size=16),
        axis.text.y = element_text(size=16),
        axis.title.y = element_text(size=23),
        strip.text = element_text(size=17)) +
  labs(x=NULL, y="Absolute Abundance")

Clades 3 and 4 appear to show the greatest absolute abundance.

Genomospecies aboslute abundance

All samples absolute abundance

gardGenomoAbU %>%
  left_join(bacLoad, by="SampleID") %>%
  filter(bacLoad<2) %>%
  mutate(absoluteAbundance=relativeAbundance*bacLoad, 
         var=factor(var, levels=c(genomosCladeOrder, "Uncharacterized_Gardnerella", "Gardnerella_vaginalis"), labels = c(genomoNamesCladeOrder, "Unch.", "Total")),
         Cohort=factor(Cohort, levels = cohorts, labels = cohortAbbrevs)) %>%
  ggplot(aes(x=var, y=absoluteAbundance, color=var)) +
  geom_quasirandom(alpha=0.5) +
  geom_boxplot(color="black", alpha=0) +
  facet_grid(Cohort~.) +  
  genomoColorsTotalUnch +
  theme(axis.text.x = element_text(size=11),
        axis.text.y = element_text(size=12),
        axis.title.y = element_text(size=20),
        strip.text = element_text(size = 11),
        legend.position="none") +
  labs(x=NULL, y="Absolute Abundance")

Genomospecies G. swidsinskii and genomospecies 7 appear to show the greatest absolute abundance.

Thresholds for presence-absence

Determine what threshold will be used to count a taxon as present vs. absent for co-occurrence and presence absence vs. microbial load analyses

MetaPhlAn will report all taxa that have 10% of marker genes non-zero based on this Google group response from Nicola Segata. Average number of marker genes per bacterial species is 184 ± 45. (Truong et al., 2015, Nature Mathods “MetaPhlAn4 for enhanced metagenomic taxonomic profiling”). About 1 million markers from >7,500 species. Minimum number of reads to be detected would be 18.4.

sgDF1 %>%
  mutate(ntile=ntile(bbmapFiltered_pairs, n=20)) %>%
  filter(ntile==5) %>%
  filter(bbmapFiltered_pairs==min(bbmapFiltered_pairs)) %>%
  .$bbmapFiltered_pairs
## [1] 104260
sgDF1 %>%
  mutate(over30K=bbmapFiltered_pairs>100000) %>%
  count(over30K) %>% 
  mutate(freq=n/sum(n))
## # A tibble: 2 × 3
##   over30K     n  freq
##   <lgl>   <int> <dbl>
## 1 FALSE     165 0.186
## 2 TRUE      723 0.814

80% of samples have greater than 100,000 read pairs. MetaPhlAn uses about 10% of the reads in the library, and 18.4/10,000 yields a functional limit of detection of roughly 0.1%.

18.4/10000 
## [1] 0.00184

A threshold of 0.1% has been used frequently in presence-absence analyses in microbiome data and will be used as a threshold here.

Gardnerella and microbial load

Microbial load by number of clades

loadVClades <- gardCladeAbU %>%
  spread(var, relativeAbundance) %>%
  select(-Gardnerella_vaginalis, -Uncharacterized_Gardnerella) %>%
  mutate_at(clades, ~case_when(.x>0~1,
                               .x==0~0)) %>%
  gather("var", "abundance", clades) %>%
  group_by(SampleID, Cohort) %>%
  summarize(nClades=sum(abundance)) %>%
  left_join(bacLoad, by="SampleID") %>%
  filter(bacLoad<2) %>%
  ggplot(aes(x=nClades, y=bacLoad)) +
  geom_jitter(alpha=0.25) +
  geom_smooth(method="lm", se=FALSE, color="blue", linetype=1) +
  stat_cor(method = "spearman", alternative = "greater", cor.coef.name = c("rho"), label.y = 1.5, label.x = 0, label.sep = "\n") + 
  facet_wrap(~Cohort) +
  scale_x_continuous(breaks = seq(0,6,1)) +
  labs(x="Clades per Sample", y="Microbial Load")
loadVClades

Is this a function of sequencing depth?

gardCladeAbU %>%
  spread(var, relativeAbundance) %>%
  select(-Gardnerella_vaginalis, -Uncharacterized_Gardnerella) %>%
  mutate_at(clades, ~case_when(.x>0~1,
                               .x==0~0)) %>%
  gather("var", "abundance", clades) %>%
  group_by(SampleID, Cohort) %>%
  summarize(nClades=sum(abundance)) %>%
  left_join(sgDF[,c("SampleID", "bacLoad", "Sickle_pairs")], by="SampleID") %>%
  filter(bacLoad<2) %>%
  ggplot(aes(x=nClades, y=Sickle_pairs)) +
  geom_jitter(alpha=0.5) +
  geom_smooth(method="lm", se=FALSE, color="blue", linetype=1) +
  stat_cor(method = "spearman", alternative = "greater", cor.coef.name = c("rho"), label.y = 6e07, label.x = 0, label.sep = "\n") + 
  facet_wrap(~Cohort) +
  scale_x_continuous(breaks = seq(0,6,1)) +
  labs(x="Clades per Sample", y="Library Size") # library size is QC-filtered reads
## `summarise()` has grouped output by 'SampleID'. You can override using the
## `.groups` argument.
## `geom_smooth()` using formula = 'y ~ x'

Microbial Reads

gardCladeAbU %>%
  spread(var, relativeAbundance) %>%
  select(-Gardnerella_vaginalis, -Uncharacterized_Gardnerella) %>%
  mutate_at(clades, ~case_when(.x>0~1,
                               .x==0~0)) %>%
  gather("var", "abundance", clades) %>%
  group_by(SampleID, Cohort) %>%
  summarize(nClades=sum(abundance)) %>%
  left_join(sgDF[,c("SampleID", "bacLoad", "bbmapFiltered_pairs")], by="SampleID") %>%
  filter(bacLoad<2) %>%
  ggplot(aes(x=nClades, y=bbmapFiltered_pairs)) +
  geom_jitter(alpha=0.5) +
  geom_smooth(method="lm", se=FALSE, color="blue", linetype=1) +
  stat_cor(method = "spearman", alternative = "greater", cor.coef.name = c("rho"), label.y = 2e07, label.x = 0, label.sep = "\n") + 
  facet_wrap(~Cohort) +
  scale_x_continuous(breaks = seq(0,6,1)) +
  labs(x="Clades per Sample", y="Microbial Reads")

Assess microbial load by clades per sample after rarefying

allUnchGardSamples <- gardCladeAbU %>%
  filter(var=="Uncharacterized_Gardnerella",
         relativeAbundance>0) %>%
  .$SampleID
#write_lines(allUnchGardSamples, "/Users/hlberman/Desktop/unchGardSamples.txt")

sgDF %>%
  mutate(isUnch=case_when(SampleID %in% allUnchGardSamples~TRUE,
                          SampleID %!in% allUnchGardSamples~FALSE)) %>%
  ggplot(aes(x=isUnch, y=bbmapFiltered_pairs, color=Cohort, shape=Cohort)) +
  geom_boxplot(alpha=0, position = position_dodge(width=1)) +
  geom_quasirandom(alpha=0.5, dodge.width = 1) +
  geom_hline(yintercept = 1e05, linetype=2, color="black") +
  geom_hline(yintercept = 16334, linetype=2, color="gray") +
  scale_y_log10() +
  labs(x="Uncharacterized Gardnerella", y="Microbial Reads")

minReads <- min(sgDF$bbmapFiltered_pairs) 
minReads
## [1] 16334

Rarefy to 100,000 reads per sample

Assess read counts and get samples with 100,000 reads

rarefiedReadCountsDF <- "./rarefied_gard_counts/rarefiedReadCounts.csv" %>%
  read_csv 

k100Samples0 <- rarefiedReadCountsDF %>%
  filter(rarefiedReads==100000) %>%
  .$SampleID

# add names for samples in MOMS-PI Enriched
k100Samples <- k100Samples0[k100Samples0 %in% HMP2HighGardSamples] %>%
  paste0(., "_E") %>%
  c(k100Samples0, .)

Import MetaPhlAn tables

rarefiedMetaphlanOuts <- "./rarefied_gard_counts/rarefiedMergedMetaphlan4Abundance.tsv" %>%
  read_tsv(comment = "#") %>%
  filter(str_detect(clade_name, "s__"), !str_detect(clade_name, "t__")) %>%
  mutate(Species = str_extract(clade_name, "(?<=s__)\\w+$")) %>%
  select(Species, everything(), -clade_name) %>%
  column_to_rownames(var="Species") %>%
  t %>%
  as.data.frame() %>%
  mutate_all(funs(./100)) %>%
  rownames_to_column(var="SampleID") %>%
  mutate(SampleID=str_extract(SampleID, ".*(?=_metaphlan_bugs_list)"), 
         SampleID=case_when(!str_detect(SampleID, "SRR")~SampleID,
                            str_detect(SampleID, "SRR")~str_extract(SampleID, "(?<=SRR)[0-9]*")))
  
rarefiedGardProportions <- rarefiedMetaphlanOuts %>%
   select(SampleID, Gardnerella_vaginalis) %>%
   replace_na(list(Gardnerella_vaginalis=0))

Proportion of clades

rarefiedCladeRelativeAbundance <- "./rarefied_gard_counts/gardCladeRelativeAbundance.tsv" %>%
  read_tsv %>%
  mutate(SampleID=as.character(SampleID)) %>%
  select(-propClade) %>%
  filter(n>=2) %>% #count as present if >=2 reads are mapped to that clade
  group_by(SampleID) %>%
  mutate(propClade=n/sum(n)) %>%
  ungroup() %>%
  select(-n)
rarefiedGardDF0 <- rarefiedCladeRelativeAbundance %>%
  full_join(rarefiedGardProportions) %>%
  mutate(relativeAbundance=propClade*Gardnerella_vaginalis) %>%
  select(-propClade) %>%
  spread(Clade, relativeAbundance) %>%
  select(-`<NA>`) %>%
  replace_na(list(C1=0, C2=0, C3=0, C4=0, C5=0, C6=0)) %>%
  right_join(sgDF[,c("SampleID", "SubjectID", "Cohort", "TermPre")]) %>%
  mutate(Uncharacterized_Gardnerella=case_when(Gardnerella_vaginalis>0&(C1+C2+C3+C4+C5+C6==0)~Gardnerella_vaginalis,
                                               Gardnerella_vaginalis==0~0,
                                               Gardnerella_vaginalis>0&(C1+C2+C3+C4+C5+C6>0)~0))

# create rows for MOMS-PI Enriched Samples
rarefiedGardDF <- rarefiedGardDF0 %>%
  filter(SubjectID %in% HMP2highGardSubjects) %>% 
  mutate(Cohort="MOMS-PI Enriched",
         SampleID=paste0(SampleID, "_E"),
         SubjectID=paste0(SubjectID, "_E")) %>%
  full_join(rarefiedGardDF0)

Figure S7: Microbial load vs. rarefied clade counts

loadVrarefiedClades <- rarefiedGardDF %>%
  filter(SampleID %in% k100Samples) %>% 
  select(-Gardnerella_vaginalis, -Uncharacterized_Gardnerella) %>%
  mutate_at(clades, ~case_when(.x>0~1,
                               .x==0~0)) %>%
  mutate(nClades=(C1+C2+C3+C4+C5+C6), 
         Cohort=factor(Cohort, cohorts)) %>%
  left_join(bacLoad, by="SampleID") %>%
  filter(bacLoad<2) %>%
  ggplot(aes(x=nClades, y=bacLoad)) +
  geom_jitter(alpha=0.25) +
  geom_smooth(method="lm", se=FALSE, color="blue", linetype=1) +
  stat_cor(method = "spearman", alternative = "greater", cor.coef.name = c("rho"), label.y = 1.5, label.x = 0, label.sep = "\n") + 
  facet_wrap(~Cohort) +
  scale_x_continuous(breaks = seq(0,6,1)) +
  labs(x="Clades per Sample", y="Microbial Load")
loadVrarefiedClades

#ggsave(file.path(figureOut, paste(Sys.Date(), "FigureS7_rarefiedCladesPerSampleVsBacLoad.png", sep = "_")))

Microbial load by taxa presence-absence

bacloadVsTaxWilcox <- mDFtoiClades %>%  
  gather("Taxon", "relativeAbundance", c(clades, Gardnerella_vaginalis, lactos, anaerobes)) %>%
  select(SampleID, Cohort, Taxon, relativeAbundance) %>%
  left_join(bacLoad, by="SampleID") %>%
  filter(bacLoad<2) %>%
  mutate(relativeAbundance=case_when(relativeAbundance<0.001~"-",
                                      relativeAbundance>=0.001~"+"),
         relativeAbundance=factor(relativeAbundance, levels=c("+", "-")),
         alternative=case_when(Taxon %in% c("Gardnerella_vaginalis", clades, anaerobes)~"greater",
                               Taxon %in% lactos~"less"),
         Taxon=factor(Taxon, levels = c("Gardnerella_vaginalis", clades, lactos, anaerobes), labels = c(abbrevs)),
         Cohort=factor(Cohort, levels = cohorts, labels=c("Stan. Enr.", "UAB Enr.", "MP Enr.", "MOMS-PI")), 
         x=paste0(Cohort, Taxon)) %>%
  filter(x!="UAB Enr.Lg") %>%
  select(-x) %>%
  group_by(alternative) %>%
  nest %>%
  mutate(data=map(data, ~group_by(.x, Cohort, Taxon))) %>%
  mutate(wilcox_test=map2(data, alternative, ~wilcox_test(data=.x, bacLoad~relativeAbundance, alternative=.y))) %>%
  select(wilcox_test) %>%
  unnest %>%
  adjust_pvalue(method = "BH") %>%
  mutate(p.star=case_when(p.adj>0.05~"",
                          p.adj<0.05&p.adj>0.01~"*",
                          p.adj<0.01&p.adj>0.001~"**",
                          p.adj<0.001~"***"))

Plot

mDFtoiClades %>%  
  gather("Taxon", "relativeAbundance", c(clades, Gardnerella_vaginalis, lactos, anaerobes)) %>%
  select(SampleID, Cohort, Taxon, relativeAbundance) %>%
  left_join(bacLoad, by="SampleID") %>%
  filter(bacLoad<2) %>%
  mutate(relativeAbundance=case_when(relativeAbundance<0.001~"-",
                                      relativeAbundance>=0.001~"+"),
         Taxon=factor(Taxon, levels = c("Gardnerella_vaginalis", clades, lactos, anaerobes), labels = c(abbrevs)),
         Cohort=factor(Cohort, levels = cohorts, labels=c("Stan. Enr.", "UAB Enr.", "MP Enr.", "MOMS-PI"))) %>%
  ggplot(aes(x=relativeAbundance, y=bacLoad)) +
  geom_violin(aes(color=relativeAbundance, fill=relativeAbundance), alpha=0.5) +
  geom_text(data =bacloadVsTaxWilcox, aes(y=1, x=1.5, label=p.star), color="black", size=4, inherit.aes = FALSE) +
  binaryColors +
  facet_grid(Cohort~Taxon) +
  scale_y_log10() +
  theme(text = element_text(size=15),
        axis.text = element_text(size = 12),
        legend.position = "none") +
  labs(x="", y="Microbial Load")

#ggsave(file.path(figureOut, paste(Sys.Date(), "FigureS8_presAbsVsBacLoad.png", sep = "_")))

Figure 4: Gardnerella and increases in microbial load

Microbial load vs. clades per sample and short list microbial load vs. presence-absence

#fig.width=5, fig.height=1.5
bacLoadpresAbs <- mDFtoiClades %>%  
  gather("Taxon", "relativeAbundance", c(clades, Gardnerella_vaginalis, lactos, anaerobesSL)) %>%
  select(SampleID, Cohort, Taxon, relativeAbundance) %>%
  left_join(bacLoad, by="SampleID") %>%
  filter(bacLoad<2) %>%
  mutate(relativeAbundance=case_when(relativeAbundance<0.001~"-",
                                      relativeAbundance>=0.001~"+"),
         Taxon=factor(Taxon, levels = c("Gardnerella_vaginalis", clades, lactos, anaerobesSL), labels = c(abbrevsSL)),
         Cohort=factor(Cohort, levels = cohorts, labels=c("Stan. Enr.", "UAB Enr.", "MP Enr.", "MOMS-PI"))) %>%
  ggplot(aes(x=relativeAbundance, y=bacLoad)) +
  geom_violin(aes(color=relativeAbundance, fill=relativeAbundance), alpha=0.5) +
  geom_text(data=subset(bacloadVsTaxWilcox, Taxon %in% abbrevsSL), aes(y=1, x=1.5, label=p.star), color="black", size=4, inherit.aes = FALSE) +
  binaryColors +
  facet_grid(Cohort~Taxon) +
  scale_y_log10() +
  theme(text = element_text(size=15),
        axis.text = element_text(size = 12),
        strip.text.y = element_text(size=7.5),
        legend.position = "none") +
  labs(x="", y="Microbial Load")

plot_grid(loadVClades, bacLoadpresAbs, nrow = 1, rel_widths = c(1,2.5), labels = c("A", "B"), label_size = 15)

#ggsave(file.path(figureOut, paste(Sys.Date(), "Figure4_gardAndBacLoad.png", sep="_")))

Gardnerella and the vaginal microbiome

Relative abundance of common taxa

Get the dominant taxon in each sample for organizing – using clades. Define as the taxon with the greatest relative abundance. If no taxon has a relative abundance greater than 30%, specify as “no type”

dominantTax <- gardCladeAb %>%
  spread(Clade, relativeAbundance) %>%
  right_join(mDF, by=c("SampleID", "Cohort")) %>%
  select(-Gardnerella_vaginalis) %>%
  mutate_at(vars(clades), replace_na,  0) %>%
  gather("dominantTax", "abundance", !one_of(c("SampleID", "Cohort"))) %>%
  group_by(SampleID) %>%
  filter(abundance==max(abundance)) %>%
  mutate(dominantTax=case_when(abundance<.3~"No type",
                               abundance>=.3~dominantTax)) %>%
  select(-abundance) %>%
  ungroup

unique(dominantTax$dominantTax)
##  [1] "C1"                                 "No type"                           
##  [3] "C2"                                 "C3"                                
##  [5] "C4"                                 "C5"                                
##  [7] "C6"                                 "Lactobacillus_gasseri"             
##  [9] "Fannyhessea_vaginae"                "Megasphaera_lornae"                
## [11] "Sneathia_vaginalis"                 "Lactobacillus_iners"               
## [13] "Prevotella_bivia"                   "Prevotella_timonensis"             
## [15] "Lactobacillus_crispatus"            "Corynebacterium_amycolatum"        
## [17] "Mycoplasma_hominis"                 "Streptococcus_mitis"               
## [19] "Bifidobacterium_longum"             "Gemella_haemolysans"               
## [21] "Haemophilus_haemolyticus"           "Bacteroides_fragilis"              
## [23] "Lactobacillus_jensenii"             "Escherichia_coli"                  
## [25] "Bifidobacteriaceae_bacterium_NR047" "Bradyrhizobium_viridifuturi"       
## [27] "Enterobacter_cloacae"               "Lactobacillus_helveticus"          
## [29] "Lactobacillus_delbrueckii"          "Enterobacter_hormaechei"
length(unique(dominantTax$dominantTax))
## [1] 30

Create dataframe for order of dominant types to order in barplot figure

dominantTaxOrder <- data.frame(dominantTax=c(clades, lactos, "Fannyhessea_vaginae", "Megasphaera_lornae", "Sneathia_vaginalis", "Prevotella_bivia", "Prevotella_timonensis", "Corynebacterium_amycolatum", "Mycoplasma_hominis", "Streptococcus_mitis", "Bifidobacterium_longum", "Gemella_haemolysans", "Haemophilus_haemolyticus", "Bacteroides_fragilis", "Escherichia_coli", "Bifidobacteriaceae_bacterium_NR047", "Bradyrhizobium_viridifuturi", "Enterobacter_cloacae", "Lactobacillus_helveticus", "Lactobacillus_delbrueckii", "Enterobacter_hormaechei", "No type"),
                               dominantTaxOrder=c(1:30))

Stacked bar plot

# Ordered by dominant taxon 
taxaBarPlot <- mDFtoiClades %>%
  select(SampleID, Cohort, clades, lactos, anaerobesSL) %>%
  filter(SampleID!="6744034", 
         SampleID!="6744034_E") %>% # this sample is 100% uncharacterized gard and therefore cannot be displayed in this figure
  mutate(Other=1-C1-C2-C3-C4-C5-C6-Lactobacillus_crispatus-Lactobacillus_gasseri-Lactobacillus_jensenii-Lactobacillus_iners-Fannyhessea_vaginae-Megasphaera_lornae-Prevotella_amnii-Sneathia_vaginalis) %>%
  gather("Taxon", "relativeAbundance", !contains(c("SampleID","Cohort"))) %>%
  mutate(relativeAbundance=na_if(relativeAbundance, 0),
         Taxon=factor(Taxon, levels=c("Other", clades, lactos, anaerobesSL))) %>%
  left_join(dominantTax, by=c("SampleID", "Cohort")) %>%
  left_join(dominantTaxOrder, by="dominantTax") %>%
  group_by(SampleID) %>%
  ggplot(aes(x=reorder(reorder(SampleID, relativeAbundance, na.rm=TRUE), dominantTaxOrder, FUN=min), y=relativeAbundance, color=Taxon, fill=Taxon)) +
  geom_bar(stat = "identity") +
  theme(axis.text.y = element_text(size=13),
        axis.title = element_text(size=18),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        strip.text = element_text(size=11)) +
  facet_wrap(~Cohort, scales = "free") +
  labs(x="Sample", y="Relative Abundance") +
  taxColorsOther
taxaBarPlot   

# ordered by microbial load
mDFtoiClades %>%
  select(SampleID, Cohort, clades, lactos, anaerobesSL) %>%
  mutate(Other=1-(C1+C2+C3+C4+C5+C6+Lactobacillus_crispatus+Lactobacillus_gasseri+Lactobacillus_jensenii+Lactobacillus_iners+Fannyhessea_vaginae+Megasphaera_lornae+Prevotella_amnii+Sneathia_vaginalis)) %>%
  gather("Taxon", "relativeAbundance", !contains(c("SampleID","Cohort"))) %>%
  left_join(bacLoad, by="SampleID") %>%
  mutate(Taxon=factor(Taxon, levels=c("Other", clades, lactos, anaerobesSL)),
         absoluteAbundance=relativeAbundance*bacLoad) %>%
  ggplot(aes(x=reorder(SampleID, absoluteAbundance, FUN=sum), y=relativeAbundance, color=Taxon, fill=Taxon)) +
  geom_bar(stat="identity") +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  facet_wrap(~Cohort, scales="free_x") +
  labs(x="Sample", y="Relative Abundance") +
  taxColorsOther

Absolute abundance of common taxa

# By absolute abundance
aaBarPlot <- mDFtoiClades %>%
  select(SampleID, Cohort, clades, lactos, anaerobesSL) %>%
  mutate(Other=1-C1-C2-C3-C4-C5-C6-Lactobacillus_crispatus-Lactobacillus_gasseri-Lactobacillus_jensenii-Lactobacillus_iners-Fannyhessea_vaginae-Megasphaera_lornae-Prevotella_amnii-Sneathia_vaginalis) %>%
  gather("Taxon", "relativeAbundance", !contains(c("SampleID","Cohort"))) %>%
  left_join(bacLoad, by="SampleID") %>%
  mutate(Taxon=factor(Taxon, levels=c("Other", clades, lactos, anaerobesSL)),
         absoluteAbundance=relativeAbundance*bacLoad) %>%
  left_join(bacLoad, by="SampleID") %>%
  ggplot(aes(x=reorder(SampleID, absoluteAbundance, FUN=sum), y=absoluteAbundance, color=Taxon, fill=Taxon)) +
  geom_bar(stat="identity") +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  facet_wrap(~Cohort, scales="free_x") +
  labs(x="Sample", y="Absolute Abundance") +
  taxColorsOther
aaBarPlot

aaBarPlot +
    coord_cartesian(ylim=c(0,1.5))

What is in the outlier samples?

outlierSamples <- bacLoad %>%
  filter(bacLoad > 2) %>%
  .$SampleID
outlierSamples
## [1] "4009835178" "1005601348" "6744677"    "6743991"    "6744354"   
## [6] "6744871"    "6744459"    "6744677_E"  "6744871_E"

Get top 5 taxa from each outlier sample

mDF %>%
  filter(SampleID %in% outlierSamples) %>%
  gather("Taxon", "Abundance", !contains(c("SampleID", "Cohort"))) %>%
  group_by(SampleID) %>%
  top_n(5) %>%
  arrange(-Abundance) %>%
  split(.$SampleID)
## $`1005601348`
## # A tibble: 5 × 4
## # Groups:   SampleID [1]
##   SampleID   Cohort            Taxon                    Abundance
##   <chr>      <fct>             <chr>                        <dbl>
## 1 1005601348 Stanford Enriched Fenollaria_massiliensis     0.247 
## 2 1005601348 Stanford Enriched Prevotella_timonensis       0.156 
## 3 1005601348 Stanford Enriched Gardnerella_vaginalis       0.0847
## 4 1005601348 Stanford Enriched Fastidiosipila_sanguinis    0.0698
## 5 1005601348 Stanford Enriched Prevotella_colorans         0.0690
## 
## $`4009835178`
## # A tibble: 5 × 4
## # Groups:   SampleID [1]
##   SampleID   Cohort       Taxon                   Abundance
##   <chr>      <fct>        <chr>                       <dbl>
## 1 4009835178 UAB Enriched Gardnerella_vaginalis      0.181 
## 2 4009835178 UAB Enriched Fannyhessea_vaginae        0.0853
## 3 4009835178 UAB Enriched Prevotella_timonensis      0.0451
## 4 4009835178 UAB Enriched Fenollaria_massiliensis    0.0424
## 5 4009835178 UAB Enriched Finegoldia_magna           0.0286
## 
## $`6743991`
## # A tibble: 5 × 4
## # Groups:   SampleID [1]
##   SampleID Cohort  Taxon                 Abundance
##   <chr>    <fct>   <chr>                     <dbl>
## 1 6743991  MOMS-PI Prevotella_colorans      0.180 
## 2 6743991  MOMS-PI Porphyromonas_SGB1979    0.0751
## 3 6743991  MOMS-PI Fenollaria_timonensis    0.0635
## 4 6743991  MOMS-PI Prevotella_timonensis    0.0584
## 5 6743991  MOMS-PI GGB79634_SGB5832         0.0416
## 
## $`6744354`
## # A tibble: 5 × 4
## # Groups:   SampleID [1]
##   SampleID Cohort  Taxon                 Abundance
##   <chr>    <fct>   <chr>                     <dbl>
## 1 6744354  MOMS-PI Prevotella_amnii         0.209 
## 2 6744354  MOMS-PI Gardnerella_vaginalis    0.134 
## 3 6744354  MOMS-PI GGB3012_SGB4003          0.113 
## 4 6744354  MOMS-PI Prevotella_timonensis    0.106 
## 5 6744354  MOMS-PI Sneathia_vaginalis       0.0923
## 
## $`6744459`
## # A tibble: 5 × 4
## # Groups:   SampleID [1]
##   SampleID Cohort  Taxon                   Abundance
##   <chr>    <fct>   <chr>                       <dbl>
## 1 6744459  MOMS-PI Prevotella_colorans        0.114 
## 2 6744459  MOMS-PI Prevotella_corporis        0.0989
## 3 6744459  MOMS-PI Prevotella_timonensis      0.0810
## 4 6744459  MOMS-PI Varibaculum_massiliense    0.0739
## 5 6744459  MOMS-PI Peptoniphilus_duerdenii    0.0606
## 
## $`6744677`
## # A tibble: 5 × 4
## # Groups:   SampleID [1]
##   SampleID Cohort  Taxon                 Abundance
##   <chr>    <fct>   <chr>                     <dbl>
## 1 6744677  MOMS-PI Gardnerella_vaginalis    0.691 
## 2 6744677  MOMS-PI Prevotella_amnii         0.100 
## 3 6744677  MOMS-PI Megasphaera_lornae       0.0549
## 4 6744677  MOMS-PI Sneathia_sanguinegens    0.0376
## 5 6744677  MOMS-PI GGB753_SGB989            0.0232
## 
## $`6744677_E`
## # A tibble: 5 × 4
## # Groups:   SampleID [1]
##   SampleID  Cohort           Taxon                 Abundance
##   <chr>     <fct>            <chr>                     <dbl>
## 1 6744677_E MOMS-PI Enriched Gardnerella_vaginalis    0.691 
## 2 6744677_E MOMS-PI Enriched Prevotella_amnii         0.100 
## 3 6744677_E MOMS-PI Enriched Megasphaera_lornae       0.0549
## 4 6744677_E MOMS-PI Enriched Sneathia_sanguinegens    0.0376
## 5 6744677_E MOMS-PI Enriched GGB753_SGB989            0.0232
## 
## $`6744871`
## # A tibble: 5 × 4
## # Groups:   SampleID [1]
##   SampleID Cohort  Taxon                               Abundance
##   <chr>    <fct>   <chr>                                   <dbl>
## 1 6744871  MOMS-PI Gardnerella_vaginalis                  0.724 
## 2 6744871  MOMS-PI Megasphaera_lornae                     0.0681
## 3 6744871  MOMS-PI Sneathia_sanguinegens                  0.0306
## 4 6744871  MOMS-PI Coriobacteriales_bacterium_DNF00809    0.0296
## 5 6744871  MOMS-PI Dialister_micraerophilus               0.0201
## 
## $`6744871_E`
## # A tibble: 5 × 4
## # Groups:   SampleID [1]
##   SampleID  Cohort           Taxon                               Abundance
##   <chr>     <fct>            <chr>                                   <dbl>
## 1 6744871_E MOMS-PI Enriched Gardnerella_vaginalis                  0.724 
## 2 6744871_E MOMS-PI Enriched Megasphaera_lornae                     0.0681
## 3 6744871_E MOMS-PI Enriched Sneathia_sanguinegens                  0.0306
## 4 6744871_E MOMS-PI Enriched Coriobacteriales_bacterium_DNF00809    0.0296
## 5 6744871_E MOMS-PI Enriched Dialister_micraerophilus               0.0201

Many have Gardnerella, Prevotella timonensis, or P. amnii

Co occurrence with Jaccard distance

All samples

Long List

#fig.height=5, fig.width=6.5}
jaccards0 <- mDFtoiClades %>%
  select(SampleID, Cohort, Gardnerella_vaginalis, clades, lactos, anaerobes) %>%
  mutate_if(is.numeric, ~case_when(.x>=0.001~1, 
                                   .x<0.001~0)) %>%
  split(.$Cohort) %>%
  map(~column_to_rownames(.x, var="SampleID")) %>%
  map(~select(.x, -Cohort)) %>%
  map(t) %>%
  map(~vegdist(.x, method="jaccard")) %>%
  map(as.matrix)

anaerobeJaccardOrder <- jaccards0 %>%
  map(melt) %>%
  map2(names(.), ~mutate(.x, cohort=.y)) %>%
  purrr::reduce(full_join) %>%
  filter(Var2 %in% anaerobes,
         Var1 %in% c("Gardnerella_vaginalis", clades)) %>%
  with_groups(Var2, summarize, mean=mean(value)) %>%
  arrange(mean) %>%
  .$Var2 %>%
  as.character()
abbrevJaccardOrder <- c("TG", "C1", "C2", "C3", "C4", "C5", "C6", "Lc", "Lg", "Lj", "Li", "Fv", "Ml", "Pa", "Pt", "Sv", "Mh", "Up", "Pb", "Fm")

jaccards <- mDFtoiClades %>%
  select(SampleID, Cohort, Gardnerella_vaginalis, clades, lactos, anaerobeJaccardOrder) %>%
  mutate_if(is.numeric, ~case_when(.x>=0.001~1, 
                                   .x<0.001~0)) %>%
  split(.$Cohort) %>%
  map(~column_to_rownames(.x, var="SampleID")) %>%
  map(~select(.x, -Cohort)) %>%
  map(t) %>%
  map(~vegdist(.x, method="jaccard")) %>%
  map(as.matrix)

jaccards$`Stanford Enriched`[lower.tri(jaccards$`Stanford Enriched`, diag = TRUE)] <- NA
jaccards$`UAB Enriched`[lower.tri(jaccards$`UAB Enriched`, diag = TRUE)] <- NA
jaccards$`MOMS-PI Enriched`[lower.tri(jaccards$`MOMS-PI Enriched`, diag = TRUE)] <- NA
jaccards$`MOMS-PI`[lower.tri(jaccards$`MOMS-PI`, diag = TRUE)] <- NA

jaccards %>%
  map(melt) %>%
  map2(names(.), ~mutate(.x, cohort=.y)) %>%
  purrr::reduce(full_join) %>%
  mutate(Var1=factor(Var1, levels=c("Gardnerella_vaginalis", clades, lactos, anaerobeJaccardOrder), labels = abbrevJaccardOrder), 
         Var2=factor(Var2, levels=rev(c("Gardnerella_vaginalis", clades, lactos, anaerobeJaccardOrder)), labels = rev(abbrevJaccardOrder)),
         cohort=factor(cohort, levels = cohorts)) %>%
  filter(!(Var1=="TG" & Var2 %in% clades),
         !is.na(value)) %>%
  ggplot(aes(x=Var1, y=Var2, fill=value)) +
  geom_tile() +
  scale_x_discrete(drop=FALSE) +
  scale_y_discrete(drop=FALSE) +
  scale_fill_gradient2(low = "#56B4E9", mid = "gray", high = "#D55E00", midpoint = 0.5) +
  theme(axis.text.x = element_text(angle=45, hjust=1, size=8),
        axis.text.y = element_text(size=8),
        strip.text.x = element_text(size = 13)) +
  facet_wrap(~cohort) +
  labs(x="", y="", fill="Jaccard Distance")

#ggsave(filename = file.path(figureOut, paste(Sys.Date(), "FigureS9_jacFourCohort.png", sep="_")), height=6, width=7.5, units="in")

Figure 5: Gardnerella impacts on signatures of the vaginal microbiome

Taxon Abundances and short list of co-occurrences

#fig.width=5.4, fig.height=1.76
jaccardsSL <- mDFtoiClades %>%
  select(SampleID, Cohort, Gardnerella_vaginalis, clades, lactos, anaerobesSL) %>%
  mutate_if(is.numeric, ~case_when(.x>=0.001~1, 
                                   .x<0.001~0)) %>%
  split(.$Cohort) %>%
  map(~column_to_rownames(.x, var="SampleID")) %>%
  map(~select(.x, -Cohort)) %>%
  map(t) %>%
  map(~vegdist(.x, method="jaccard")) %>%
  map(as.matrix)

jaccardsSL$`Stanford Enriched`[lower.tri(jaccardsSL$`Stanford Enriched`, diag = TRUE)] <- NA
jaccardsSL$`UAB Enriched`[lower.tri(jaccardsSL$`UAB Enriched`, diag = TRUE)] <- NA
jaccardsSL$`MOMS-PI Enriched`[lower.tri(jaccardsSL$`MOMS-PI Enriched`, diag = TRUE)] <- NA
jaccardsSL$`MOMS-PI`[lower.tri(jaccardsSL$`MOMS-PI`, diag = TRUE)] <- NA

jaccardFigure <- jaccardsSL %>%
  map(melt) %>%
  map2(names(.), ~mutate(.x, cohort=.y)) %>%
  purrr::reduce(full_join) %>%
  mutate(Var1=factor(Var1, levels=c("Gardnerella_vaginalis", clades, lactos, anaerobesSL), labels = abbrevsSL), 
         Var2=factor(Var2, levels=rev(c("Gardnerella_vaginalis", clades, lactos, anaerobesSL)), labels = rev(abbrevsSL)),
         cohort=factor(cohort, levels=cohorts)) %>%
  filter(!(Var1=="TG" & Var2 %in% clades),
         !is.na(value)) %>%
  ggplot(aes(x=Var1, y=Var2, fill=value)) +
  geom_tile() +
  scale_x_discrete(drop=FALSE) +
  scale_y_discrete(drop=FALSE) +
  scale_fill_gradient2(low = "#56B4E9", mid = "gray", high = "#D55E00", midpoint = 0.5) +
  theme(axis.text.x = element_text(angle=45, hjust=1, size=6.5), 
        axis.text.y = element_text(size=7),
        strip.text = element_text(size=11)) +
  facet_wrap(~cohort) +
  coord_fixed() +
  labs(x="", y="", fill="Jaccard Distance")

plot_grid(taxaBarPlot, jaccardFigure, nrow=1, labels = c("A", "B"), label_size = 15, rel_heights = c(1, 1.1))

#ggsave(filename = file.path(figureOut, paste(Sys.Date(), "Figure5_microbiomeAndJaccard.png", sep="_")))

Gardnerella variants by PTB status

Clade abundance by PTB status

Relative Abundance

#plot showing clade relative abundance in term vs. preterm birth in all cohorts
allCohortsCladeRelativePTBplot <- gardCladeAbU %>%
  left_join(sgDF[,c("SampleID", "SubjectID", "TermPre", "bacLoad")], by="SampleID") %>%
  filter(!is.na(TermPre), var!="Gardnerella_vaginalis", var!="Uncharacterized_Gardnerella") %>% #SampleID %!in% unchGardSamples,
  with_groups(c(SubjectID, Cohort, TermPre, var), summarize, abundance=mean(relativeAbundance)) %>%
  mutate(var=factor(var, levels = clades, labels = clades),
         Cohort=factor(Cohort, levels=cohorts, labels=cohortAbbrevs)) %>%
  ggplot(aes(x=TermPre, y=abundance, color=TermPre, fill=TermPre)) +
  geom_quasirandom(alpha=0.5) +
  geom_boxplot(color="black", alpha=0) +
  binaryColors +
  facet_grid(Cohort~var, scales="free_y") +
  theme(legend.position = "none") +
  labs(x=NULL, y="Relative Abundance")

allCohortsCladeRelativePTBplot

  1. Compute differences between means for each subject and transform
# create function to mutate dataframe to create differences and then observe transformations
meanDifferenceTransformations <- function(inputDF, vars){

  #get list of pairs
  pairs <- combn(vars, 2) %>%
    t %>%
    as.data.frame %>%
    mutate(pairs=paste(V1, V2, sep="-")) %>%
    .$pairs
  
  # calculate differences and transformations of values
  diffs_trans <- inputDF %>%
    mutate(meanAbs=set_names(meanAbs, var)) %>%
    nest(diffs=c(var, meanAbs)) %>%
    mutate(diffs=map(diffs, ~abs(outer(.x$meanAbs, .x$meanAbs, `-`)))) %>% #compute differences
    mutate(diffs=map(diffs, data.frame)) %>%
    mutate(diffs=map(diffs, ~rownames_to_column(.x, var="var1"))) %>%
    mutate(diffs=map(diffs, ~gather(.x, "var2", "diff", vars))) %>%
    mutate(diffs=map(diffs, ~transmute(.x, pair=paste(var1, var2, sep="-"), diff=diff))) %>%
    mutate(diffs=map(diffs, ~filter(.x, pair %in% pairs))) %>% # filter repeats
    mutate(diffs=map(diffs, ~mutate(.x, sqrt=sqrt(diff), #square root transformation
                                          frthrt=diff^(1/4), #fourth root transformation
                                          log=log10(diff + 1e-5)))) %>% #log10 transformation with pseudocount
    unnest
  
  # plot histograms
  plots <- diffs_trans %>%
    gather("Transformation", "Value", c(diff, sqrt, frthrt, log)) %>%
    mutate(Transformation=factor(Transformation, levels=c("diff", "sqrt", "frthrt", "log"), labels=c("Difference", "Square Root", "Fourth Root", "log10 with Pseudocount"))) %>%
    ggplot(aes(x=Value)) +
    geom_histogram() +
    facet_wrap(~Transformation, scales="free")
  
  if(vars==clades){
  # perform shapiro-wilk test for normality. null hypothesis is normality.
  shapiro_results <- map(list(Difference=diffs_trans$diff, sqrt=diffs_trans$sqrt, fourthrt=diffs_trans$frthrt, log=diffs_trans$log), shapiro_test) %>% 
      map2(., names(.), ~mutate(.x, transformation=.y)) %>%
      purrr::reduce(full_join) %>%
      select(transformation, statistic, p.value)
  return(list(diffs_trans=diffs_trans, plots=plots, shapiro_results=shapiro_results))
  }
  
  if(vars==genomos){
    return(list(diffs_trans=diffs_trans, plots=plots))
  }

 }

Assess and choose transformation

cladeRelativeTransfs <- gardCladeAbU %>%
  left_join(sgDF[,c("SampleID", "SubjectID", "TermPre")], by="SampleID") %>%
  filter(!is.na(TermPre), var!="Uncharacterized_Gardnerella", var!="Gardnerella_vaginalis") %>%
  with_groups(c(SubjectID, Cohort, TermPre, var), summarize, meanAbs=mean(relativeAbundance)) %>%
  meanDifferenceTransformations(., clades)

cladeRelativeTransfs$plots

cladeRelativeTransfs$shapiro_results %>%
  formattable()
transformation statistic p.value
Difference 0.6198463 4.393296e-64
sqrt 0.8524329 2.097396e-47
fourthrt 0.9658575 1.068939e-26
log 0.8869648 3.575165e-43

Choose fourth root transformation

  1. Perform logistic regression on mean differences to see if they vary by term or preterm birth Null hypothesis is that coefficients = 0 Model: \(y_{ij}=\beta_{0i} + x_{ij}\beta_{1ij}+e{ij}\)
# function for performing logistic regression on mean differences of clade abundances
meanDiffLogisticRegression <- function(inputDF, transformation){
  # get cohort Ns
  coh_ns <- gardCladeAbU %>%
    left_join(sgDF[,c("SampleID", "SubjectID", "TermPre")], by="SampleID") %>%
    filter(!is.na(TermPre), var!="Uncharacterized_Gardnerella", var!="Gardnerella_vaginalis") %>%
    group_by(Cohort) %>%
    summarize(cohort_n=n_distinct(SubjectID))
  
  # perform logitic regressions tests
  logiOuts <- inputDF %>%
    gather("transformation", "difference", c(diff, sqrt, frthrt, log)) %>% # filter to choose values with appropriate transformation
    filter(transformation==transformation) %>%
    select(Cohort, TermPre, pair, difference) %>%
    mutate(TermPre=case_when(TermPre=="T"~0,
                             TermPre=="P"~1)) %>%
    split(list(.$Cohort, .$pair)) %>%
    map(~glm(data=.x, TermPre~difference, family="binomial")) %>%
    map(tidy) %>% 
    map2(., names(.), ~mutate(.x, Cohort_Pair=.y)) %>%
    map(~separate(.x, Cohort_Pair, c("Cohort", "Pair"), sep="\\.")) %>%
    purrr::reduce(full_join, by = c("term", "estimate", "std.error", "statistic", "p.value", "Cohort", "Pair")) %>%
    left_join(coh_ns, by="Cohort") %>%
    mutate(std.dev=std.error*sqrt(cohort_n))
  
  logiPlots <- logiOuts %>%
    filter(term=="difference") %>%
    mutate(Cohort=factor(Cohort, levels=cohorts, labels = cohortAbbrevs),
           Pair=factor(Pair, levels = rev(unique(logiOuts$Pair)))) %>%
    ggplot(aes(y=Pair, x=estimate, xmin=estimate-std.dev, xmax=estimate+std.dev)) +
    geom_pointrange() +
    geom_vline(aes(xintercept=0), linetype=2, color="gray", inherit.aes=FALSE) +
    facet_grid(~Cohort) 
  
  return(list(logiOuts=logiOuts, logiPlots=logiPlots))
}
cladeRelMeanDiffRegression <- meanDiffLogisticRegression(cladeRelativeTransfs$diffs_trans, "frthrt")

cladeRelMeanDiffRegression$logiOuts %>%
  filter(term=="difference",
         p.value<0.05) %>%
        nrow
## [1] 0

Figure S11: All cohorts Gardnerella and PTB logistic regression

#, fig.height=2, fig.width=5
plot_grid(allCohortsCladeRelativePTBplot, cladeRelMeanDiffRegression$logiPlots, nrow=1, rel_widths = c(1, 0.75), align = "v", labels = c("A", "B"), label_size = 15)

#ggsave(file.path(figureOut, paste(Sys.Date(), "FigureS11_allCladesLogisticRegressionPTB.png", sep="_")))

Absolute Abundance

Plot abundances

#plot showing clade absolute abundance in term vs. preterm birth in all cohorts
allCohortsCladeAbsolutePTBplot <- gardCladeAbU %>%
  left_join(sgDF[,c("SampleID", "SubjectID", "TermPre", "bacLoad")], by="SampleID") %>%
  filter(!is.na(TermPre), var!="Gardnerella_vaginalis", var!="Uncharacterized_Gardnerella", bacLoad<2) %>%
  mutate(absoluteAbundance=relativeAbundance*bacLoad) %>%
  with_groups(c(SubjectID, Cohort, TermPre, var), summarize, abundance=mean(absoluteAbundance)) %>%
  mutate(var=factor(var, levels = clades, labels = clades),
         Cohort=factor(Cohort, levels=cohorts, labels=cohortAbbrevs)) %>%
  ggplot(aes(x=TermPre, y=abundance, color=TermPre, fill=TermPre)) +
  geom_quasirandom(alpha=0.5) +
  geom_boxplot(color="black", alpha=0) +
  binaryColors +
  facet_grid(Cohort~var, scales="free_y") +
  theme(legend.position = "none") +
  labs(x=NULL, y="Absolute Abundance")

allCohortsCladeAbsolutePTBplot

Assess and choose transformation

cladeAbsoluteTransfs <- gardCladeAbU %>%
  left_join(sgDF[,c("SampleID", "SubjectID", "TermPre", "bacLoad")], by="SampleID") %>%
  filter(!is.na(TermPre), var!="Gardnerella_vaginalis", var!="Uncharacterized_Gardnerella", bacLoad<2) %>%
  mutate(absoluteAbundance=relativeAbundance*bacLoad) %>%
  with_groups(c(SubjectID, Cohort, TermPre, var), summarize, meanAbs=mean(absoluteAbundance)) %>%
  meanDifferenceTransformations(., clades)

cladeAbsoluteTransfs$plots

cladeAbsoluteTransfs$shapiro_results %>%
  formattable()
transformation statistic p.value
Difference 0.4604125 6.508211e-71
sqrt 0.7622174 1.723784e-55
fourthrt 0.9432242 3.424417e-33
log 0.9397050 5.216362e-34

Choose log10 + pseudocount transformation

Perform logistic regression on mean differences to see if they vary by term or preterm birth Null hypothesis is that coefficients = 0

cladeAbsMeanDiffRegression <- meanDiffLogisticRegression(cladeAbsoluteTransfs$diffs_trans, "log")

cladeAbsMeanDiffRegression$logiOuts %>%
  filter(term=="difference",
        p.value<0.05) %>%
  nrow
## [1] 0
#fig.height=2, fig.width=5
plot_grid(allCohortsCladeAbsolutePTBplot, cladeAbsMeanDiffRegression$logiPlots, nrow=1, rel_widths = c(1, 0.75), align = "v")

Gardnerella and PTB in MOMS-PI study Only

Due to enrichment in Gardnerella in Stanford and UAB clades. The following analyses will focus on the MOMS-PI cohorts. Create data frame:

momsptbDF <- gardCladeAbU %>%
    left_join(sgDF[,c("SampleID", "SubjectID", "TermPre")], by="SampleID") %>%
    filter(str_detect(Cohort, "MOMS-PI"),
        !is.na(TermPre)) %>%
  mutate(Cohort=as.character(Cohort))

Variant abundance versus PTB

Clades

Wilcox Ranked Sum in MOMS-PI cohort ONLY

cladeRelativeWilcoxResults <- momsptbDF %>%
  filter(Cohort=="MOMS-PI",
         var!="Uncharacterized_Gardnerella") %>%
  with_groups(c(SubjectID, Cohort, TermPre, var), summarize, Relative=mean(relativeAbundance)) %>%
  group_by(var) %>%
  wilcox_test(Relative~TermPre, alternative="greater")
  
cladeRelativeWilcoxResults %>%
  arrange(p) %>%
  formattable()
var .y. group1 group2 n1 n2 statistic p
C3 Relative P T 43 90 2294.0 0.0416
Gardnerella_vaginalis Relative P T 43 90 2271.5 0.0530
C1 Relative P T 43 90 2111.0 0.1980
C6 Relative P T 43 90 2078.5 0.2270
C5 Relative P T 43 90 2076.0 0.2420
C2 Relative P T 43 90 1953.0 0.4660
C4 Relative P T 43 90 1710.0 0.8610

Wilcoxon rank sum test: Absolute abundance

cladeAbsoluteWilcoxResults <- momsptbDF %>%
  left_join(sgDF[,c("SampleID", "bacLoad")], by="SampleID") %>%
  filter(Cohort=="MOMS-PI", bacLoad<2,
         var!="Uncharacterized_Gardnerella") %>%
  mutate(absoluteAbundance=relativeAbundance*bacLoad) %>%
  with_groups(c(SubjectID, Cohort, TermPre, var), summarize, Absolute=mean(absoluteAbundance)) %>%
  group_by(var) %>%
  wilcox_test(Absolute~TermPre, alternative="greater")
  
cladeAbsoluteWilcoxResults %>%
  arrange(p) %>%
  formattable()
var .y. group1 group2 n1 n2 statistic p
C3 Absolute P T 43 90 2267.0 0.0546
Gardnerella_vaginalis Absolute P T 43 90 2262.5 0.0578
C1 Absolute P T 43 90 2166.0 0.1330
C5 Absolute P T 43 90 2115.0 0.1860
C6 Absolute P T 43 90 2086.0 0.2140
C2 Absolute P T 43 90 2082.0 0.2370
C4 Absolute P T 43 90 1842.0 0.6740

Plot

# save p values
cladeWilcoxResults <- cladeRelativeWilcoxResults %>%
  full_join(cladeAbsoluteWilcoxResults, by = c("var", ".y.", "group1", "group2", "n1", "n2", "statistic", "p")) %>%
  mutate(p.star=case_when(p>0.05&p!=0.0578~"",
                          p==0.0578~"^",
                          p<0.05&p>0.01~"*",
                          p<0.01&p>0.001~"**",
                          p<0.001~"***"),
         var=factor(var, levels = c(clades,  "Gardnerella_vaginalis"), labels = c(clades, "Total")))

Relative Abundance Plot:

momspiCladeAbundancePTBPlot <-  momsptbDF %>%
  filter(Cohort=="MOMS-PI",
         var!="Uncharacterized_Gardnerella") %>%
  with_groups(c(SubjectID, Cohort, TermPre, var), summarize, abundance=mean(relativeAbundance)) %>%
  mutate(var=factor(var, levels = c(clades,  "Gardnerella_vaginalis"), labels = c(clades, "Total"))) %>%
  ggplot(aes(x=TermPre, y=abundance, color=TermPre, fill=TermPre)) +
  geom_quasirandom(alpha=0.5) +
  geom_boxplot(color="black", alpha=0) +
  geom_text(data = cladeWilcoxResults, aes(y=0.6, x=1.5, position="dodge", label=p.star), color="black", size=6, inherit.aes = FALSE) +
  binaryColors +
  facet_grid(.~var) +
  theme(legend.position = "none",
        strip.text.x = element_text(size=13),
        strip.text.y = element_text(size=9),
        axis.title=element_text(size=16)) +
  labs(x=NULL, y="Relative\nAbundance")
momspiCladeAbundancePTBPlot

Absolute Abundance Plot:

momsptbDF %>%
  filter(Cohort=="MOMS-PI",
         var!="Uncharacterized_Gardnerella") %>%
  left_join(sgDF[,c("SampleID", "bacLoad")], by="SampleID") %>%
  filter(bacLoad<2) %>%
  mutate(absoluteAbundance=relativeAbundance*bacLoad) %>%
  with_groups(c(SubjectID, Cohort, TermPre, var), summarize, abundance=mean(absoluteAbundance)) %>%
  mutate(var=factor(var, levels = c(clades,  "Gardnerella_vaginalis"), labels = c(clades, "Total"))) %>%
  ggplot(aes(x=TermPre, y=abundance, color=TermPre, fill=TermPre)) +
  geom_quasirandom(alpha=0.5) +
  geom_boxplot(color="black", alpha=0) +
  geom_text(data = cladeWilcoxResults, aes(y=0.6, x=1.5, position="dodge", label=p.star), color="black", size=6, inherit.aes = FALSE) +
  binaryColors +
  facet_grid(.~var) +
  theme(legend.position = "none",
        strip.text.x = element_text(size=13),
        strip.text.y = element_text(size=9),
        axis.title=element_text(size=16)) +
  labs(x=NULL, y="Absolute Abundance")

Genomospecies

Wilcoxon rank sum test for genomospecies, MOMS-PI cohort ONLY

# Relative abundance
genomoRelativeWilcoxResults <- gardGenomoAbU %>%
    left_join(sgDF[,c("SampleID", "SubjectID", "TermPre")], by="SampleID") %>%
    filter(Cohort=="MOMS-PI",
           !is.na(TermPre),
           var!="Uncharacterized_Gardnerella") %>%
  with_groups(c(SubjectID, Cohort, TermPre, var), summarize, Relative=mean(relativeAbundance)) %>%
  group_by(var) %>%
  wilcox_test(Relative~TermPre, alternative="greater")
  

# Absolute abundance
genomoAbsoluteWilcoxResults <- gardGenomoAbU %>%
    left_join(sgDF[,c("SampleID", "SubjectID", "TermPre")], by="SampleID") %>%
    filter(Cohort=="MOMS-PI",
           !is.na(TermPre),
           var!="Uncharacterized_Gardnerella") %>%
  mutate(Cohort=as.character(Cohort)) %>%
  left_join(sgDF[,c("SampleID", "bacLoad")], by="SampleID") %>%
  filter(Cohort=="MOMS-PI", bacLoad<2) %>%
  mutate(absoluteAbundance=relativeAbundance*bacLoad) %>%
  with_groups(c(SubjectID, Cohort, TermPre, var), summarize, Absolute=mean(absoluteAbundance)) %>%
  group_by(var) %>%
  wilcox_test(Absolute~TermPre, alternative="greater")
  
genomoRelativeWilcoxResults %>%
  arrange(p) %>%
  formattable
var .y. group1 group2 n1 n2 statistic p
GS10 Relative P T 43 90 2489.0 0.000257
GS9 Relative P T 43 90 2506.0 0.000436
GS8 Relative P T 43 90 2435.0 0.002320
GS14 Relative P T 43 90 2388.0 0.007900
Gardnerella_vaginalis Relative P T 43 90 2271.5 0.053000
GS2 Relative P T 43 90 2220.0 0.073900
GS12 Relative P T 43 90 2133.5 0.151000
GS13 Relative P T 43 90 2082.0 0.200000
GS11 Relative P T 43 90 2026.0 0.271000
GS7 Relative P T 43 90 1997.5 0.372000
GS1 Relative P T 43 90 1980.0 0.414000
GS4 Relative P T 43 90 1944.0 0.482000
GS5 Relative P T 43 90 1922.0 0.527000
GS6 Relative P T 43 90 1868.0 0.628000
GS3 Relative P T 43 90 1854.0 0.662000
genomoAbsoluteWilcoxResults %>%
  arrange(p) %>%
  formattable
var .y. group1 group2 n1 n2 statistic p
GS10 Absolute P T 43 90 2488.0 0.000263
GS9 Absolute P T 43 90 2510.0 0.000401
GS8 Absolute P T 43 90 2435.0 0.002180
GS14 Absolute P T 43 90 2393.0 0.007340
Gardnerella_vaginalis Absolute P T 43 90 2262.5 0.057800
GS2 Absolute P T 43 90 2200.0 0.089200
GS12 Absolute P T 43 90 2156.5 0.125000
GS13 Absolute P T 43 90 2083.0 0.198000
GS11 Absolute P T 43 90 2026.0 0.271000
GS1 Absolute P T 43 90 2034.0 0.315000
GS7 Absolute P T 43 90 1994.5 0.378000
GS5 Absolute P T 43 90 1979.0 0.415000
GS6 Absolute P T 43 90 1962.5 0.448000
GS4 Absolute P T 43 90 1947.0 0.476000
GS3 Absolute P T 43 90 1878.0 0.616000
genomoWilcoxResults <- genomoRelativeWilcoxResults %>%
  full_join(genomoAbsoluteWilcoxResults, by = c("var", ".y.", "group1", "group2", "n1", "n2", "statistic", "p")) %>%
  mutate(p.star=case_when(p>0.05&p!=0.0578~"",
                          p==0.0578~"^",
                          p<0.05&p>0.01~"*",
                          p<0.01&p>0.001~"**",
                          p<0.001~"***"),
         var=factor(var, levels = c(genomosCladeOrder,  "Gardnerella_vaginalis"), labels = c(genomoNamesCladeOrder, "Total")))

Relative Abundance Plot:

momspiGenomoAbundancePTBPlot <-gardGenomoAbU %>%
  left_join(sgDF[,c("SampleID", "SubjectID", "TermPre")], by="SampleID") %>%
  filter(Cohort=="MOMS-PI", !is.na(TermPre),
         var!="Uncharacterized_Gardnerella") %>%
  with_groups(c(SubjectID, Cohort, TermPre, var), summarize, abundance=mean(relativeAbundance)) %>%
  mutate(var=factor(var, levels = c(genomosCladeOrder,  "Gardnerella_vaginalis"), labels = c(genomoNamesCladeOrder, "Total"))) %>%
  ggplot(aes(x=TermPre, y=abundance, color=TermPre, fill=TermPre)) +
  geom_quasirandom(alpha=0.5) +
  geom_boxplot(color="black", alpha=0) +
  geom_text(data = genomoWilcoxResults, aes(y=0.6, x=1.5, position="dodge", label=p.star), color="black", size=6, inherit.aes = FALSE) +
  binaryColors +
  facet_grid(.~var) +
  theme(legend.position = "none",
        strip.text.x = element_text(size=8),
        strip.text.y = element_text(size = 9),
        axis.title=element_text(size=16)) +
  labs(x=NULL, y="Relative\nAbundance")
momspiGenomoAbundancePTBPlot

Absolute Abundance Plot:

gardGenomoAbU %>%
  left_join(sgDF[,c("SampleID", "SubjectID", "TermPre", "bacLoad")], by="SampleID") %>%
  filter(Cohort=="MOMS-PI", !is.na(TermPre), bacLoad<2,
         var!="Uncharacterized_Gardnerella") %>%
  mutate(absoluteAbundance=relativeAbundance*bacLoad) %>%
  with_groups(c(SubjectID, Cohort, TermPre, var), summarize, abundance=mean(absoluteAbundance)) %>%
  mutate(var=factor(var, levels = c(genomosCladeOrder,  "Gardnerella_vaginalis"), labels = c(genomoNamesCladeOrder, "Total"))) %>%
  ggplot(aes(x=TermPre, y=abundance, color=TermPre, fill=TermPre)) +
  geom_quasirandom(alpha=0.5) +
  geom_boxplot(color="black", alpha=0) +
  geom_text(data = genomoWilcoxResults, aes(y=0.6, x=1.5, position="dodge", label=p.star), color="black", size=6, inherit.aes = FALSE) +
  binaryColors +
  facet_grid(.~var) +
  theme(legend.position = "none",
        strip.text.x = element_text(size=8),
        strip.text.y = element_text(size = 9),
        axis.title=element_text(size=16)) +
  labs(x=NULL, y="Absolute Abundance")

Number of clades by PTB

Mean clades per sample in term vs. preterm birth patients in MOMS-PI Only. Calculate Wilcoxon rank sum test

nCladesWilcox <- momsptbDF %>%
  filter(Cohort=="MOMS-PI") %>%
  spread(var, relativeAbundance) %>%
  mutate_at(clades, ~case_when(.x>=0.001~1,
                               .x<0.001~0)) %>%
  mutate(nClades=C1+C2+C3+C4+C5+C6) %>%
  with_groups(c(SubjectID, Cohort, TermPre), summarize, meanNClades=mean(nClades)) %>%
  wilcox_test(meanNClades~TermPre, alternative="greater")

nCladesWilcox %>%
  formattable()
.y. group1 group2 n1 n2 statistic p
meanNClades P T 43 90 2355 0.0217

Plot

momspiCladeCountPlot <- momsptbDF %>%
  filter(Cohort=="MOMS-PI") %>%
  spread(var, relativeAbundance) %>%
  mutate_at(clades, ~case_when(.x>=0.001~1,
                               .x<0.001~0)) %>%
  mutate(nClades=C1+C2+C3+C4+C5+C6) %>%
  with_groups(c(SubjectID, Cohort, TermPre), summarize, meanNClades=mean(nClades)) %>%
  ggplot(aes(x=TermPre, y=meanNClades, color=TermPre)) +
  geom_boxplot(alpha=0, color="black", show.legend=FALSE) +
  geom_beeswarm(alpha=0.5, cex = 3, show.legend = FALSE) +
  ylim(0,7) +
  binaryColors +
  labs(x=NULL, y="Mean Gestational\nClade Count") +
  stat_pvalue_manual(nCladesWilcox, label="p", y.position = 6.7)
momspiCladeCountPlot

Assess mean number of clades of rarefying

Perform Wilcoxon Rank Sum test

rarefiedNCladesWilcox <- rarefiedGardDF %>%
  filter(Cohort=="MOMS-PI", !is.na(TermPre), SampleID %in% k100Samples) %>% 
  mutate_at(clades, ~case_when(.x>=0.001~1,
                               .x<0.001~0)) %>%
  mutate(nClades=C1+C2+C3+C4+C5+C6) %>%
  with_groups(c(SubjectID, Cohort, TermPre), summarize, meanNClades=mean(nClades)) %>%
  wilcox_test(meanNClades~TermPre, alternative = "greater")

rarefiedNCladesWilcox %>%
  formattable()  
.y. group1 group2 n1 n2 statistic p
meanNClades P T 42 89 2227 0.0386

Figure S10: Mean gestational clade richness vs. PTB after rarefying

rarefiedGardDF %>%
  filter(Cohort=="MOMS-PI", !is.na(TermPre), SampleID %in% k100Samples) %>% 
  select(-Gardnerella_vaginalis, -Uncharacterized_Gardnerella) %>%
  gather("Clade", "Abundance", c(C1, C2, C3, C4, C5, C6)) %>%
  mutate(Abundance=case_when(Abundance>=0.001~1,
                             Abundance<0.001~0)) %>%
  with_groups(c(SampleID, SubjectID, Cohort, TermPre), summarize, nClades=sum(Abundance)) %>%
  with_groups(c(SubjectID, Cohort, TermPre), summarize, meanNClades=mean(nClades)) %>%
  ggplot(aes(x=TermPre, y=meanNClades, color=TermPre)) +
  geom_boxplot(alpha=0, color="black", show.legend=FALSE) +
  geom_beeswarm(alpha=0.5, cex = 3, show.legend = FALSE) +
  ylim(0,7) +
  binaryColors +
  labs(x=NULL, y="Mean Gestational\nClade Count") +
  stat_pvalue_manual(rarefiedNCladesWilcox, label="p", y.position = 6.7)

#ggsave(filename = file.path(figureOut, paste(Sys.Date(), "FigureS10_rarefiedCladeRichnessvsPTB.png", sep="_")))

Microbial load by PTB

Perform Wilcoxon Rank Sum tests

bacloadWilcox <- momsptbDF %>%
  filter(Cohort=="MOMS-PI") %>%
  left_join(sgDF[,c("SampleID", "bacLoad")], by="SampleID") %>%
  filter(Cohort=="MOMS-PI", bacLoad<2) %>%
  with_groups(c(SubjectID, Cohort, TermPre), summarize, meanBacLoad=mean(bacLoad)) %>%
  wilcox_test(meanBacLoad~TermPre, alternative="greater")

bacloadWilcox %>%
  formattable
.y. group1 group2 n1 n2 statistic p
meanBacLoad P T 43 90 2436 0.00803

Plot results

momspiBacLoadPTBplot <- momsptbDF %>%
  filter(Cohort=="MOMS-PI") %>%
  left_join(sgDF[,c("SampleID", "bacLoad")], by="SampleID") %>%
  filter(Cohort=="MOMS-PI", bacLoad<2) %>%
  with_groups(c(SubjectID, Cohort, TermPre), summarize, meanBacLoad=mean(bacLoad)) %>%
  ggplot(aes(x=TermPre, color=TermPre, y=meanBacLoad)) +
  geom_boxplot(alpha=0, color="black", show.legend=FALSE) +
  geom_beeswarm(alpha=0.5, cex = 2.1, show.legend = FALSE) +
  ylim(0,0.75) +
  binaryColors +
  labs(x=NULL, y="Mean Gestational\nMicrobial Load") +
  stat_pvalue_manual(bacloadWilcox, label="p", y.position = 0.7)

Figure 6: MOMS-PI Cohort Gardnerella and PTB

#, fig.height=3, fig.width=5
momspiPTBAbundancePlots <- plot_grid(momspiCladeAbundancePTBPlot, momspiGenomoAbundancePTBPlot, ncol=1, labels = c("A", "B"), label_size = 15)
momspiCladeCountBacLoadPTBPlots <- plot_grid(momspiCladeCountPlot, momspiBacLoadPTBplot, nrow=1, labels = c("C", "D"), label_size = 15)
plot_grid(momspiPTBAbundancePlots, momspiCladeCountBacLoadPTBPlots, ncol=1, rel_heights = c(1, 0.6))

#ggsave(file.path(figureOut, paste(Sys.Date(), "Figure6_momspiPTB.png", sep="_")))

Library Size by PTB

libSizeWilcox <- momsptbDF %>%
  filter(Cohort=="MOMS-PI") %>%
  left_join(sgDF[,c("SampleID", "Sickle_pairs")], by="SampleID") %>%
  with_groups(c(SubjectID, Cohort, TermPre), summarize, meanLibSize=mean(Sickle_pairs)) %>%
  wilcox_test(meanLibSize~TermPre, alternative="greater")

libSizeWilcox %>%
  formattable
.y. group1 group2 n1 n2 statistic p
meanLibSize P T 43 90 2007 0.365
momsptbDF %>%
  filter(Cohort=="MOMS-PI") %>%
  left_join(sgDF[,c("SampleID", "Sickle_pairs")], by="SampleID") %>%
  with_groups(c(SubjectID, Cohort, TermPre), summarize, meanLibSize=mean(Sickle_pairs)) %>%
  ggplot(aes(x=TermPre, color=TermPre, y=meanLibSize)) +
  geom_boxplot(alpha=0, color="black", show.legend=FALSE) +
  geom_beeswarm(alpha=0.5, cex = 2.1, show.legend = FALSE) +
  binaryColors +
  theme(axis.text = element_text(size=13),
        strip.text = element_text(size=13),
        axis.title = element_text(size=16)) +
  labs(x=NULL, y="Mean Subject Library Size") +
  stat_pvalue_manual(libSizeWilcox, label="p", y.position = 3.5e+07)

Gardnerella Clades by Race

Subject average abundance by self reported race

gardCladeAbU %>%
  left_join(sgDF[,c("SampleID", "SubjectID", "Race")], by="SampleID") %>%
  group_by(SubjectID, Cohort, Race, var) %>%
  summarise(meanRelativeAbundance=mean(relativeAbundance)) %>%
  filter(var!="Uncharacterized_Gardnerella") %>%
  mutate(var=factor(var, levels = c(clades, "Gardnerella_vaginalis"), labels = c(clades, "Total")),
         Cohort=factor(Cohort, levels = cohorts, labels=cohortAbbrevs),
         Race=factor(Race, levels=c("Asian", "Black", "White", "Other", "Unknown"))) %>%
  ggplot(aes(x=Race, y=meanRelativeAbundance, color=Race, fill=Race)) +
  geom_point(alpha=0.5, position = position_jitterdodge()) +
  geom_boxplot(color="black", alpha=0, outlier.shape = 0) +
  facet_grid(Cohort~var) +
  scale_x_discrete(labels=c("A", "B", "W", "O")) +
  theme(legend.position = "bottom") +
  labs(x=NULL, y="Relative Abundance")

Subject average absolute abundance by race

gardCladeAbU %>%
  left_join(sgDF[,c("SampleID", "SubjectID", "TermPre", "Race", "bacLoad")], by="SampleID") %>%
  filter(bacLoad<2) %>%
  mutate(absoluteAbundance=relativeAbundance*bacLoad) %>%
  group_by(SubjectID, Cohort, Race, var) %>%
  summarise(meanAbsoluteAbundance=mean(absoluteAbundance)) %>%
  mutate(var=factor(var, levels = c(clades, "Uncharacterized_Gardnerella",  "Gardnerella_vaginalis"), labels = c(clades, "Unch.", "Total")),
         Cohort=factor(Cohort, levels = cohorts, labels=cohortAbbrevs),
         Race=factor(Race, levels=c("Asian", "Black", "White", "Other", "Unknown"))) %>%
  ggplot(aes(x=Race, y=meanAbsoluteAbundance, color=Race, fill=Race)) +
  geom_point(alpha=0.5, position = position_jitterdodge()) +
  geom_boxplot(color="black", alpha=0, outlier.shape = 0) +
  facet_grid(Cohort~var) +
  scale_x_discrete(labels=c("A", "B", "W", "O")) +
  theme(legend.position = "bottom") +
  labs(x=NULL, y="Absolute Abundance")

Session Info

sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggpubr_0.6.0      rstatix_0.7.2     cowplot_1.1.1     coin_1.4-2       
##  [5] survival_3.5-5    ggExtra_0.10.1    ggbeeswarm_0.7.2  vegan_2.6-4      
##  [9] lattice_0.21-8    permute_0.9-7     reshape2_1.4.4    formattable_0.2.1
## [13] broom_1.0.5       kableExtra_1.3.4  lubridate_1.9.2   forcats_1.0.0    
## [17] stringr_1.5.0     dplyr_1.1.2       purrr_1.0.1       readr_2.1.4      
## [21] tidyr_1.3.0       tibble_3.2.1      ggplot2_3.4.3     tidyverse_2.0.0  
## 
## loaded via a namespace (and not attached):
##  [1] TH.data_1.1-2      colorspace_2.1-0   ggsignif_0.6.4     ellipsis_0.3.2    
##  [5] modeltools_0.2-23  rstudioapi_0.15.0  farver_2.1.1       bit64_4.0.5       
##  [9] fansi_1.0.4        mvtnorm_1.1-3      xml2_1.3.4         codetools_0.2-19  
## [13] splines_4.1.1      cachem_1.0.8       libcoin_1.0-9      knitr_1.44        
## [17] jsonlite_1.8.4     cluster_2.1.4      shiny_1.7.5        compiler_4.1.1    
## [21] httr_1.4.7         backports_1.4.1    Matrix_1.5-1       fastmap_1.1.1     
## [25] cli_3.6.1          later_1.3.1        htmltools_0.5.5    tools_4.1.1       
## [29] gtable_0.3.4       glue_1.6.2         Rcpp_1.0.10        carData_3.0-5     
## [33] jquerylib_0.1.4    vctrs_0.6.2        svglite_2.1.1      nlme_3.1-162      
## [37] xfun_0.39          ps_1.7.5           rvest_1.0.3        timechange_0.2.0  
## [41] mime_0.12          miniUI_0.1.1.1     lifecycle_1.0.3    MASS_7.3-60       
## [45] zoo_1.8-12         scales_1.2.1       vroom_1.6.3        hms_1.1.3         
## [49] promises_1.2.0.1   parallel_4.1.1     sandwich_3.0-2     yaml_2.3.7        
## [53] sass_0.4.6         stringi_1.7.12     highr_0.10         rlang_1.1.1       
## [57] pkgconfig_2.0.3    systemfonts_1.0.4  matrixStats_0.63.0 evaluate_0.21     
## [61] htmlwidgets_1.6.2  labeling_0.4.3     processx_3.8.1     bit_4.0.5         
## [65] tidyselect_1.2.0   plyr_1.8.8         magrittr_2.0.3     R6_2.5.1          
## [69] magick_2.7.4       generics_0.1.3     multcomp_1.4-25    pillar_1.9.0      
## [73] withr_2.5.0        mgcv_1.8-42        abind_1.4-5        crayon_1.5.2      
## [77] car_3.1-2          utf8_1.2.3         tzdb_0.3.0         rmarkdown_2.24    
## [81] grid_4.1.1         callr_3.7.3        digest_0.6.31      webshot_0.5.5     
## [85] xtable_1.8-4       httpuv_1.6.10      stats4_4.1.1       munsell_0.5.0     
## [89] beeswarm_0.4.0     viridisLite_0.4.2  vipor_0.4.5        bslib_0.5.1